home *** CD-ROM | disk | FTP | other *** search
/ .net 2002 March / DotNetMagazine-Issue107-Coverdisc-NET107-02-03-PCMac.bin / pc / PC Software / picks / HTTrack / httrack-3.22-3.exe / {app} / src / htsparse.c < prev    next >
Encoding:
C/C++ Source or Header  |  2002-11-17  |  108.7 KB  |  2,381 lines

  1. /* ------------------------------------------------------------ */
  2. /*
  3. HTTrack Website Copier, Offline Browser for Windows and Unix
  4. Copyright (C) Xavier Roche and other contributors
  5.  
  6. This program is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU General Public License
  8. as published by the Free Software Foundation; either version 2
  9. of the License, or any later version.
  10.  
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. GNU General Public License for more details.
  15.  
  16. You should have received a copy of the GNU General Public License
  17. along with this program; if not, write to the Free Software
  18. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19.  
  20.  
  21. Important notes:
  22.  
  23. - We hereby ask people using this source NOT to use it in purpose of grabbing
  24. emails addresses, or collecting any other private information on persons.
  25. This would disgrace our work, and spoil the many hours we spent on it.
  26.  
  27.  
  28. Please visit our Website: http://www.httrack.com
  29. */
  30.  
  31.  
  32. /* ------------------------------------------------------------ */
  33. /* File: Main source                                            */
  34. /* DIRECT INCLUDE TO httrack.c                                  */
  35. /* Author: Xavier Roche                                         */
  36. /* ------------------------------------------------------------ */
  37.  
  38.  
  39. #if HTS_ANALYSTE
  40. if (hts_htmlcheck(r.adr,(int)r.size,urladr,urlfil)) {
  41. #endif          
  42.   FILE* fp=NULL;      // fichier Ècrit localement 
  43.   char* adr=r.adr;    // pointeur (on parcourt)
  44.   char* lastsaved;    // adresse du dernier octet sauvÈ + 1
  45.   if ( (opt.debug>1) && (opt.log!=NULL) ) {
  46.     fspc(opt.log,"debug"); fprintf(opt.log,"scan file.."LF); test_flush;
  47.   }
  48.  
  49.  
  50.   // Indexing!
  51. #if HTS_MAKE_KEYWORD_INDEX
  52.   if (opt.kindex) {
  53.     if (index_keyword(r.adr,r.size,r.contenttype,savename,opt.path_html)) {
  54.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  55.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..done"LF); test_flush;
  56.       }
  57.     } else {
  58.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  59.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..error!"LF); test_flush;
  60.       }
  61.     }
  62.   }
  63. #endif
  64.  
  65.   // Now, parsing
  66.   if ((opt.getmode & 1) && (ptr>0)) {  // rÈcupÈrer les html sur disque       
  67.     // créer le fichier html local
  68.     HT_ADD_FOP;   // Ècrire peu ‡ peu le fichier
  69.   }
  70.   
  71.   if (!error) {
  72.     int detect_title=0;  // dÈtection  du title
  73.     //
  74.     char* in_media=NULL; // in other media type (real media and so..)
  75.     int intag=0;         // on est dans un tag
  76.     int incomment=0;     // dans un <!--
  77.     int inscript=0;      // dans un scipt pour applets javascript)
  78.     int inscript_tag=0;  // on est dans un <body onLoad="... terminÈ par >
  79.     char inscript_tag_lastc='\0';  
  80.                            // terminaison (" ou ') du "<body onLoad=.."
  81.     int inscriptgen=0;     // on est dans un code gÈnÈrant, ex aprËs obj.write("..
  82.     char scriptgen_q='\0'; // caractËre faisant office de guillemet (' ou ")
  83.     int no_esc_utf=0;      // ne pas echapper chars > 127
  84.     int nofollow=0;        // ne pas scanner
  85.     //
  86.     int parseall_lastc='\0';    // dernier caractËre parsÈ pour parseall
  87.     int parseall_incomment=0;   // dans un /* */ (exemple: a = /* URL */ "img.gif";)
  88.     //
  89.     char* intag_start=adr;
  90.     char* intag_startattr=NULL;
  91.     int intag_start_valid=0;
  92.     HT_ADD_START;    // dÈbuter
  93.  
  94.  
  95.     /* statistics */
  96.     if ((opt.getmode & 1) && (ptr>0)) { 
  97.       /*
  98.       HTS_STAT.stat_files++;
  99.       HTS_STAT.stat_bytes+=r.size;
  100.       */
  101.     }
  102.  
  103.     /* Primary list or URLs */
  104.     if (ptr == 0) {
  105.       intag=1;
  106.       intag_start_valid=0;
  107.     }
  108.     /* Check if the file is a .js file */
  109.     else if (
  110.       (strfield2(r.contenttype,"application/x-javascript")!=0)
  111.       || (strfield2(r.contenttype,"text/css")!=0)
  112.       ) {      /* JavaScript js file */
  113.       inscript=1;
  114.       intag=1;     // because aprËs <script> on y est .. - pas utile
  115.       intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  116.       if ((opt.debug>1) && (opt.log!=NULL)) {
  117.         fspc(opt.log,"debug"); fprintf(opt.log,"note: this file is a javascript file"LF); test_flush;
  118.       }
  119.     }
  120.     /* Or a real audio */
  121.     else if (strfield2(r.contenttype,"audio/x-pn-realaudio")!=0) {      /* realaudio link file */
  122.       inscript=intag=1;
  123.       intag_start_valid=0;
  124.       in_media="RAM";       // real media!
  125.     }
  126.     // Detect UTF8 format
  127.     if (is_unicode_utf8((unsigned char*) r.adr, (unsigned int) r.size) == 1) {
  128.       no_esc_utf=1;
  129.     } else {
  130.       no_esc_utf=0;
  131.     }
  132.     // Hack to prevent any problems with ram files or other files
  133.     * ( r.adr + r.size ) = '\0';
  134.  
  135.  
  136.     // ------------------------------------------------------------
  137.     // analyser ce qu'il y a en mémoire (fichier html)
  138.     // on scanne les balises
  139.     // ------------------------------------------------------------
  140. #if HTS_ANALYSTE
  141.     _hts_in_html_done=0;     // 0% scannÈs
  142.     _hts_cancel=0;           // pas de cancel
  143.     _hts_in_html_parsing=1;  // flag pour indiquer un parsing
  144. #endif
  145.     base[0]='\0';    // effacer base-href
  146.     lastsaved=adr;
  147.     do {
  148.       int p=0;
  149.       int valid_p=0;      // force to take p even if == 0
  150.       int ending_p='\0';  // ending quote?
  151.       error=0;
  152.  
  153.       /* Hack to avoid NULL char problems with C syntax */
  154.       /* Yes, some bogus HTML pages can embed null chars
  155.          and therefore can not be properly handled if this hack is not done
  156.       */
  157.       if ( ! (*adr) ) {
  158.         if ( ((int) (adr - r.adr)) < r.size)
  159.           *adr=' ';
  160.       }
  161.  
  162.  
  163.  
  164.       /*
  165.       index.html built here
  166.       */
  167.       // Construction index.html (sommaire)
  168.       // Avant de tester les a href,
  169.       // Ici on teste si l'on doit construire l'index vers le(s) site(s) miroir(s)
  170.       if (!makeindex_done) {  // autoriation d'Ècrire un index
  171.         if (!detect_title) {
  172.           if (opt.depth == liens[ptr]->depth) {    // on note toujours les premiers liens
  173.             if (!in_media) {
  174.               if (opt.makeindex && (ptr>0)) {
  175.                 if (opt.getmode & 1) {  // autorisation d'Ècrire
  176.                   p=strfield(adr,"title");  
  177.                   if (p) {
  178.                     if (*(adr-1)=='/') p=0;    // /title
  179.                   } else {
  180.                     if (strfield(adr,"/html"))
  181.                       p=-1;                    // noter, mais sans titre
  182.                     else if (strfield(adr,"body"))
  183.                       p=-1;                    // noter, mais sans titre
  184.                     else if ( ((int) (adr - r.adr) ) >= (r.size-1) )
  185.                       p=-1;                    // noter, mais sans titre
  186.                     else if ( (int) (adr - r.adr) >= r.size - 2)   // we got to hurry
  187.                       p=-1; // xxc xxc xxc
  188.                   }
  189.                 } else
  190.                   p=0;
  191.                 
  192.                 if (p) {    // ok center                            
  193.                   if (makeindex_fp==NULL) {
  194.                     verif_backblue(opt.path_html);    // gÈnÈrer gif
  195.                     makeindex_fp=filecreate(fconcat(opt.path_html,"index.html"));
  196.                     if (makeindex_fp!=NULL) {
  197.  
  198.                       // Header
  199.                       fprintf(makeindex_fp,template_header,
  200.                         "<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"
  201.                         );
  202.  
  203.                     } else makeindex_done=-1;    // fait, erreur
  204.                   }
  205.                   
  206.                   if (makeindex_fp!=NULL) {
  207.                     char tempo[HTS_URLMAXSIZE*2];
  208.                     char s[HTS_URLMAXSIZE*2];
  209.                     char* a=NULL;
  210.                     char* b=NULL;
  211.                     s[0]='\0';
  212.                     if (p>0) {
  213.                       a=strchr(adr,'>');
  214.                       if (a!=NULL) {
  215.                         a++;
  216.                         while(is_space(*a)) a++;    // sauter espaces & co
  217.                         b=strchr(a,'<');   // prochain tag
  218.                       }
  219.                     }
  220.                     if (lienrelatif(tempo,liens[ptr]->sav,concat(opt.path_html,"index.html"))==0) {
  221.                       detect_title=1;      // ok dÈtectÈ pour cette page!
  222.                       makeindex_links++;   // un de plus
  223.                       strcpybuff(makeindex_firstlink,tempo);
  224.                       //
  225.                       if ((b==a) || (a==NULL) || (b==NULL)) {    // pas de titre
  226.                         strcpybuff(s,tempo);
  227.                       } else if ((b-a)<256) {
  228.                         b--;
  229.                         while(is_space(*b)) b--;
  230.                         strncpy(s,a,b-a+1);
  231.                         *(s+(b-a)+1)='\0';
  232.                       }
  233.  
  234.                       // Body
  235.                       fprintf(makeindex_fp,template_body,
  236.                         tempo,
  237.                         s
  238.                         );
  239.  
  240.                     }
  241.                   }
  242.                 }
  243.               }
  244.             }
  245.             
  246.           } else if (liens[ptr]->depth<opt.depth) {   // on a sautÈ level1+1 et level1
  247.             HT_INDEX_END;
  248.           }
  249.         } // if (opt.makeindex)
  250.       }
  251.       // FIN Construction index.html (sommaire)
  252.       /*
  253.       end -- index.html built here
  254.       */
  255.       
  256.  
  257.  
  258.       /* Parse */
  259.       if (
  260.            (*adr=='<')    /* No starting tag */
  261.         && (!inscript)    /* Not in (java)script */
  262.         && (!incomment)   /* Not in comment (<!--) */
  263.       ) { 
  264.         intag=1;
  265.         parseall_incomment=0;
  266.         //inquote=0;  // effacer quote
  267.         intag_start=adr; intag_start_valid=1;
  268.         codebase[0]='\0';    // effacer Èventuel codebase
  269.         
  270.         if (opt.getmode & 1) {  // sauver html
  271.           p=strfield(adr,"</html");
  272.           if (p==0) p=strfield(adr,"<head>");
  273.           // if (p==0) p=strfield(adr,"<doctype");
  274.           if (p) {
  275.             if (strnotempty(opt.footer)) {
  276.               char tempo[1024+HTS_URLMAXSIZE*2];
  277.               char gmttime[256];
  278.               char* eol="\n";
  279.               tempo[0]='\0';
  280.               if (strchr(r.adr,'\r'))
  281.                 eol="\r\n";
  282.               time_gmt_rfc822(gmttime);
  283.               strcatbuff(tempo,eol);
  284.               sprintf(tempo+strlen(tempo),opt.footer,jump_identification(urladr),urlfil,gmttime,HTTRACK_VERSIONID,"","","","","","","");
  285.               strcatbuff(tempo,eol);
  286.               //fwrite(tempo,1,strlen(tempo),fp);
  287.               HT_ADD(tempo);
  288.             }
  289.           }
  290.         }        
  291.         
  292.         // éliminer les <!-- (commentaires) : intag dévalidé
  293.         if (*(adr+1)=='!')
  294.           if (*(adr+2)=='-')
  295.             if (*(adr+3)=='-') {
  296.               intag=0;
  297.               incomment=1;
  298.               intag_start_valid=0;
  299.             }
  300.             
  301.       }
  302.       else if (
  303.            (*adr=='>')                        /* ending tag */
  304.         && ( (!inscript) || (inscript_tag) )  /* and in tag (or in script) */
  305.       ) {
  306.         if (inscript_tag) {
  307.           inscript_tag=inscript=0;
  308.           intag=0;
  309.           incomment=0;
  310.           intag_start_valid=0;
  311.         } else if (!incomment) {
  312.           intag=0; //inquote=0;
  313.           
  314.           // entrée dans du javascript?
  315.           // on parse ICI car il se peut qu'on ait eu à parser les src=.. dedans
  316.           //if (!inscript) {  // sinon on est dans un obj.write("..
  317.           if ((intag_start_valid) && 
  318.             (
  319.             check_tag(intag_start,"script")
  320.             ||
  321.             check_tag(intag_start,"style")
  322.             )
  323.             ) {
  324.             char* a=intag_start;    // <
  325.             // ** while(is_realspace(*(--a)));
  326.             if (*a=='<') {  // s˚r que c'est un tag?
  327.               inscript=1;
  328.               intag=1;     // because aprËs <script> on y est .. - pas utile
  329.               intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  330.             }
  331.           }
  332.         } else {                               /* end of comment? */
  333.           // vérifier fermeture correcte
  334.           if ( (*(adr-1)=='-') && (*(adr-2)=='-') ) {
  335.             intag=0;
  336.             incomment=0;
  337.             intag_start_valid=0;
  338.           }
  339. #if GT_ENDS_COMMENT
  340.           /* wrong comment ending */
  341.           else {
  342.             /* check if correct ending does not exist
  343.                <!-- foo > example <!-- bar > is sometimes accepted by browsers
  344.                when no --> is used somewhere else.. darn those browsers are dirty
  345.             */
  346.             if (!strstr(adr,"-->")) {
  347.               intag=0;
  348.               incomment=0;
  349.               intag_start_valid=0;
  350.             }
  351.           }
  352. #endif
  353.         }
  354.         //}
  355.       }
  356.       //else if (*adr==34) {
  357.       //  inquote=(inquote?0:1);
  358.       //}
  359.       else if (intag || inscript) {    // nous sommes dans un tag/commentaire, tester si on recoit un tag
  360.         int p_type=0;
  361.         int p_nocatch=0;
  362.         int p_searchMETAURL=0;  // chercher ..URL=<url>
  363.         int add_class=0;        // ajouter .class
  364.         int add_class_dots_to_patch=0;   // number of '.' in code="x.y.z<realname>"
  365.         char* p_flush=NULL;
  366.         
  367.         
  368.         // ------------------------------------------------------------
  369.         // parsing évolué
  370.         // ------------------------------------------------------------
  371.         if (((isalpha((unsigned char)*adr)) || (*adr=='/') || (inscript) || (inscriptgen))) {  // sinon pas la peine de tester..
  372.  
  373.  
  374.           /* caractère de terminaison pour "miniparsing" javascript=.. ? 
  375.              (ex: <a href="javascript:()" action="foo"> ) */
  376.           if (inscript_tag) {
  377.             if (inscript_tag_lastc) {
  378.               if (*adr == inscript_tag_lastc) {
  379.                 /* sortir */
  380.                 inscript_tag=inscript=0;
  381.                 incomment=0;
  382.               }
  383.             }
  384.           }
  385.           
  386.           
  387.           // Note:
  388.           // Certaines pages ne respectent pas le html
  389.           // notamment les guillemets ne sont pas fixés
  390.           // Nous sommes dans un tag, donc on peut faire un test plus
  391.           // large pour pouvoir prendre en compte ces particularités
  392.           
  393.           // à vérifier: ACTION, CODEBASE, VRML
  394.           
  395.           if (in_media) {
  396.             if (strcmp(in_media,"RAM")==0) { // real media
  397.               p=0;
  398.               valid_p=1;
  399.             }
  400.           } else if (ptr>0) {        /* pas premiËre page 0 (primary) */
  401.             p=0;  // saut pour le nom de fichier: adresse nom fichier=adr+p
  402.             
  403.             // ------------------------------
  404.             // détection d'écriture JavaScript.
  405.             // osons les obj.write et les obj.href=.. ! osons!
  406.             // note: inscript==1 donc on sautera après les \"
  407.             if (inscript) {
  408.               if (inscriptgen) {          // on est dÈja dans un objet gÈnÈrant..
  409.                 if (*adr==scriptgen_q) {  // fermeture des " ou '
  410.                   if (*(adr-1)!='\\') {   // non
  411.                     inscriptgen=0;        // ok parsing terminÈ
  412.                   }
  413.                 }
  414.               } else {
  415.                 char* a=NULL;
  416.                 char check_this_fking_line=0;  // parsing code javascript..
  417.                 char must_be_terminated=0;     // caractËre obligatoire de terminaison!
  418.                 int token_size;
  419.                 if (!(token_size=strfield(adr,".writeln"))) // dÈtection ...objet.write[ln]("code html")...
  420.                   token_size=strfield(adr,".write");
  421.                 if (token_size) {
  422.                   a=adr+token_size;
  423.                   while(is_realspace(*a)) a++; // sauter espaces
  424.                   if (*a=='(') {  // dÈbut parenthËse
  425.                     check_this_fking_line=2;  // ‡ parser!
  426.                     must_be_terminated=')';
  427.                     a++;  // sauter (
  428.                   }
  429.                 }
  430.                 // euhh ??? ???
  431.                 /* else if (strfield(adr,".href")) {  // dÈtection ...objet.href="...
  432.                 a=adr+5;
  433.                 while(is_realspace(*a)) a++; // sauter espaces
  434.                 if (*a=='=') {  // ohh un Ègal
  435.                 check_this_fking_line=1;  // ‡ noter!
  436.                 must_be_terminated=';';   // et si t'as oubliÈ le ; tu sais pas coder
  437.                 a++;   // sauter =
  438.                 }
  439.                 
  440.                 }*/
  441.                 
  442.                 // on a un truc du genre instruction"code généré" dont on parse le code
  443.                 if (check_this_fking_line) {
  444.                   while(is_realspace(*a)) a++;
  445.                   if ((*a=='\'') || (*a=='"')) {  // dÈpart de '' ou ""
  446.                     char *b;
  447.                     int ex=0;
  448.                     scriptgen_q=*a;    // quote
  449.                     b=a+1;      // dÈpart de la chaÓne
  450.                     // vÈrifier forme ("code") et pas ("code"+var), ingÈrable
  451.                     do {
  452.                       a++;  // caractËre suivant
  453.                       if (*a==scriptgen_q) if (*(a-1)!='\\')  // quote non slash
  454.                         ex=1;            // sortie
  455.                       if ((*a==10) || (*a==13))
  456.                         ex=1;
  457.                     } while(!ex);
  458.                     if (*a==scriptgen_q) {  // fin du quote
  459.                       a++;
  460.                       while(is_realspace(*a)) a++;
  461.                       if (*a==must_be_terminated) {  // parenthËse fermante: ("..")
  462.                         
  463.                         // bon, on doit parser une ligne javascript
  464.                         // 1) si check.. ==1 alors c'est un nom de fichier direct, donc
  465.                         // on fixe p sur le saut nécessaire pour atteindre le nom du fichier
  466.                         // et le moteur se débrouillera ensuite tout seul comme un grand
  467.                         // 2) si check==2 c'est un peu plus tordu car là on génère du
  468.                         // code html au sein de code javascript au sein de code html
  469.                         // dans ce cas on doit fixer un flag à un puis ensuite dans la boucle
  470.                         // on devra parser les instructions standard comme <a href etc
  471.                         // NOTE: le code javascript autogénéré n'est pas pris en compte!!
  472.                         // (et ne marche pas dans 50% des cas de toute façon!)
  473.                         if (check_this_fking_line==1) {
  474.                           p=(int) (b - adr);    // calculer saut!
  475.                         } else {
  476.                           inscriptgen=1;        // SCRIPTGEN actif
  477.                           adr=b;                // jump
  478.                         }
  479.                         
  480.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  481.                           char str[512];
  482.                           str[0]='\0';
  483.                           strncatbuff(str,b,minimum((int) (a - b + 1), 32));
  484.                           fspc(opt.log,"debug"); fprintf(opt.log,"active code (%s) detected in javascript: %s"LF,(check_this_fking_line==2)?"parse":"pickup",str); test_flush;
  485.                         }
  486.                       }
  487.                       
  488.                     }
  489.                     
  490.                   }
  491.                   
  492.                   
  493.                 }
  494.               }
  495.             }
  496.             // fin détection code générant javascript vers html
  497.             // ------------------------------
  498.             
  499.             
  500.             // analyse proprement dite, A HREF=.. etc..
  501.             if (!p) {
  502.               // si dans un tag, et pas dans un script - sauf si on analyse un obj.write("..
  503.               if ((intag && (!inscript)) || inscriptgen) {
  504.                 if ( (*(adr-1)=='<') || (is_space(*(adr-1))) ) {   // <tag < tag etc
  505.                   // <A HREF=.. pour les liens HTML
  506.                   p=rech_tageq(adr,"href");
  507.                   if (p) {    // href.. tester si c'est une bas href!
  508.                     if ((intag_start_valid) && check_tag(intag_start,"base")) {  // oui!
  509.                       // ** note: base href et codebase ne font pas bon mÈnage..
  510.                       p_type=2;    // c'est un chemin
  511.                     }
  512.                   }
  513.                   
  514.                   /* Tags supplémentaires à vérifier (<img src=..> etc) */
  515.                   if (p==0) {
  516.                     int i=0;
  517.                     while( (p==0) && (strnotempty(hts_detect[i])) ) {
  518.                       p=rech_tageq(adr,hts_detect[i]);
  519.                       i++;
  520.                     }
  521.                   }
  522.  
  523.                   /* Tags supplémentaires en début à vérifier (<object .. hotspot1=..> etc) */
  524.                   if (p==0) {
  525.                     int i=0;
  526.                     while( (p==0) && (strnotempty(hts_detectbeg[i])) ) {
  527.                       p=rech_tageqbegdigits(adr,hts_detectbeg[i]);
  528.                       i++;
  529.                     }
  530.                   }
  531.                   
  532.                   /* Tags supplémentaires à vérifier : URL=.. */
  533.                   if (p==0) {
  534.                     int i=0;
  535.                     while( (p==0) && (strnotempty(hts_detectURL[i])) ) {
  536.                       p=rech_tageq(adr,hts_detectURL[i]);
  537.                       i++;
  538.                     }
  539.                     if (p)
  540.                       p_searchMETAURL=1;
  541.                   }
  542.                   
  543.                   /* Tags supplémentaires à vérifier, mais à ne pas capturer */
  544.                   if (p==0) {
  545.                     int i=0;
  546.                     while( (p==0) && (strnotempty(hts_detectandleave[i])) ) {
  547.                       p=rech_tageq(adr,hts_detectandleave[i]);
  548.                       i++;
  549.                     }
  550.                     if (p)
  551.                       p_nocatch=1;      /* ne pas rechercher */
  552.                   }
  553.                   
  554.                   /* Evénements */
  555.                   if (p==0) {
  556.                     int i=0;
  557.                     /* détection onLoad etc */
  558.                     while( (p==0) && (strnotempty(hts_detect_js[i])) ) {
  559.                       p=rech_tageq(adr,hts_detect_js[i]);
  560.                       i++;
  561.                     }
  562.                     /* non détecté - détecter également les onXxxxx= */
  563.                     if (p==0) {
  564.                       if ( (*adr=='o') && (*(adr+1)=='n') && isUpperLetter(*(adr+2)) ) {
  565.                         p=0;
  566.                         while(isalpha((unsigned char)adr[p]) && (p<64) ) p++;
  567.                         if (p<64) {
  568.                           while(is_space(adr[p])) p++;
  569.                           if (adr[p]=='=')
  570.                             p++;
  571.                           else p=0;
  572.                         } else p=0;
  573.                       }
  574.                     }
  575.                     /* OK, événement repéré */
  576.                     if (p) {
  577.                       inscript_tag_lastc=*(adr+p);     /* ‡ attendre ‡ la fin */
  578.                       adr+=p;     /* saut */
  579.                                   /*
  580.                                   On est dÈsormais dans du code javascript
  581.                       */
  582.                       inscript_tag=inscript=1;
  583.                     }
  584.                     p=0;        /* quoi qu'il arrive, ne rien dÈmarrer ici */
  585.                   }
  586.                   
  587.                   // <APPLET CODE=.. pour les applet java.. [CODEBASE (chemin..) à faire]
  588.                   if (p==0) {
  589.                     p=rech_tageq(adr,"code");
  590.                     if (p) {
  591.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  592.                         p_type=-1;  // juste le nom de fichier+dossier, Ècire avant codebase 
  593.                         add_class=1;   // ajouter .class au besoin                         
  594.                         
  595.                         // vérifier qu'il n'y a pas de codebase APRES
  596.                         // sinon on swappe les deux.
  597.                         // pas très propre mais c'est ce qu'il y a de plus simple à faire!!
  598.                         
  599.                         {
  600.                           char *a;
  601.                           a=adr;
  602.                           while((*a) && (*a!='>') && (!rech_tageq(a,"codebase"))) a++;
  603.                           if (rech_tageq(a,"codebase")) {  // banzai! codebase=
  604.                             char* b;
  605.                             b=strchr(a,'>');
  606.                             if (b) {
  607.                               if (((int) (b - adr)) < 1000) {    // au total < 1Ko
  608.                                 char tempo[HTS_URLMAXSIZE*2];
  609.                                 tempo[0]='\0';
  610.                                 strncatbuff(tempo,a,(int) (b - a) );
  611.                                 strcatbuff( tempo," ");
  612.                                 strncatbuff(tempo,adr,(int) (a - adr - 1));
  613.                                 // éventuellement remplir par des espaces pour avoir juste la taille
  614.                                 while((int) strlen(tempo)<((int) (b - adr)))
  615.                                   strcatbuff(tempo," ");
  616.                                 // pas d'erreur?
  617.                                 if ((int) strlen(tempo) == ((int) (b - adr) )) {
  618.                                   strncpy(adr,tempo,strlen(tempo));   // PAS d'octet nul ‡ la fin!
  619.                                   p=0;    // DEVALIDER!!
  620.                                   p_type=0;
  621.                                   add_class=0;
  622.                                 }
  623.                               }
  624.                             }
  625.                           }
  626.                         }
  627.                         
  628.                       }
  629.                     }
  630.                   }
  631.                   
  632.                   // liens à patcher mais pas à charger (ex: codebase)
  633.                   if (p==0) {  // note: si non chargÈ (ex: ignorer .class) patchÈ tout de mÍme
  634.                     p=rech_tageq(adr,"codebase");
  635.                     if (p) {
  636.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  637.                         p_type=-2;
  638.                       } else p=-1;   // ne plus chercher
  639.                     }
  640.                   }
  641.                   
  642.                   
  643.                   // Meta tags pour robots
  644.                   if (p==0) {
  645.                     if (opt.robots) {
  646.                       if ((intag_start_valid) && check_tag(intag_start,"meta")) {
  647.                         if (rech_tageq(adr,"name")) {    // name=robots.txt
  648.                           char tempo[1100];
  649.                           char* a;
  650.                           tempo[0]='\0';
  651.                           a=strchr(adr,'>');
  652. #if DEBUG_ROBOTS
  653.                           printf("robots.txt meta tag detected\n");
  654. #endif
  655.                           if (a) {
  656.                             if (((int) (a - adr)) < 999 ) {
  657.                               strncatbuff(tempo,adr,(int) (a - adr));
  658.                               if (strstrcase(tempo,"content")) {
  659.                                 if (strstrcase(tempo,"robots")) {
  660.                                   if (strstrcase(tempo,"nofollow")) {
  661. #if DEBUG_ROBOTS
  662.                                     printf("robots.txt meta tag: nofollow in %s%s\n",urladr,urlfil);
  663. #endif
  664.                                     nofollow=1;       // NE PLUS suivre liens dans cette page
  665.                                     if (opt.errlog) {
  666.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s not scanned (follow robots meta tag)"LF,urladr,urlfil);
  667.                                       test_flush;
  668.                                     }
  669.                                   }
  670.                                 }
  671.                               }
  672.                             }
  673.                           }
  674.                         }
  675.                       }
  676.                     }
  677.                   }
  678.                   
  679.                   // entrée dans une applet javascript
  680.                   /*if (!inscript) {  // sinon on est dans un obj.write("..
  681.                   if (p==0)
  682.                   if (rech_sampletag(adr,"script"))
  683.                   if (check_tag(intag_start,"script")) {
  684.                   inscript=1;
  685.                   }
  686.                         }*/
  687.                   
  688.                   // Ici on procède à une analyse du code javascript pour tenter de récupérer
  689.                   // certains fichiers évidents.
  690.                   // C'est devenu obligatoire vu le nombre de pages qui intègrent
  691.                   // des images réactives par exemple
  692.                 }
  693.               } else if (inscript) {
  694.                 if (
  695.                   (
  696.                   (strfield(adr,"/script"))
  697.                   ||
  698.                   (strfield(adr,"/style"))
  699.                   )
  700.                   ) {
  701.                   char* a=adr;
  702.                   //while(is_realspace(*(--a)));
  703.                   while( is_realspace(*a) ) a--;
  704.                   a--;
  705.                   if (*a=='<') {  // s˚r que c'est un tag?
  706.                     inscript=0;
  707.                   }
  708.                 } else {
  709.                   /*
  710.                   Script Analyzing - different types supported:
  711.                     foo="url"
  712.                     foo("url") or foo(url)
  713.                     foo "url"
  714.                   */
  715.                   int nc;
  716.                   char  expected     = '=';          // caractËre attendu aprËs
  717.                   char* expected_end = ";";
  718.                   int can_avoid_quotes=0;
  719.                   char quotes_replacement='\0';
  720.                   if (inscript_tag)
  721.                     expected_end=";\"\'";            // voir a href="javascript:doc.location='foo'"
  722.                   nc = strfield(adr,".src");  // nom.src="image";
  723.                   if (!nc) nc = strfield(adr,".location");  // document.location="doc"
  724.                   if (!nc) nc = strfield(adr,".href");  // document.location="doc"
  725.                   if (!nc) if ( (nc = strfield(adr,".open")) ) { // window.open("doc",..
  726.                     expected='(';    // parenthËse
  727.                     expected_end="),";  // fin: virgule ou parenthËse
  728.                   }
  729.                   if (!nc) if ( (nc = strfield(adr,".replace")) ) { // window.replace("url")
  730.                     expected='(';    // parenthËse
  731.                     expected_end=")";  // fin: parenthËse
  732.                   }
  733.                   if (!nc) if ( (nc = strfield(adr,".link")) ) { // window.link("url")
  734.                     expected='(';    // parenthËse
  735.                     expected_end=")";  // fin: parenthËse
  736.                   }
  737.                   if (!nc) if ( (nc = strfield(adr,"url")) ) { // url(url)
  738.                     expected='(';    // parenthËse
  739.                     expected_end=")";  // fin: parenthËse
  740.                     can_avoid_quotes=1;
  741.                     quotes_replacement=')';
  742.                   }
  743.                   if (!nc) if ( (nc = strfield(adr,"import")) ) { // import "url"
  744.                     if (is_space(*(adr+nc))) {
  745.                       expected=0;    // no char expected
  746.                     } else
  747.                       nc=0;
  748.                   }
  749.                   if (nc) {
  750.                     char *a;
  751.                     a=adr+nc;
  752.                     while(is_realspace(*a)) a++;
  753.                     if ((*a == expected) || (!expected)) {
  754.                       if (expected)
  755.                         a++;
  756.                       while(is_realspace(*a)) a++;
  757.                       if ((*a==34) || (*a=='\'') || (can_avoid_quotes)) {
  758.                         char *b,*c;
  759.                         int ndelim=1;
  760.                         if ((*a==34) || (*a=='\''))
  761.                           a++;
  762.                         else
  763.                           ndelim=0;
  764.                         b=a;
  765.                         if (ndelim) {
  766.                           while((*b!=34) && (*b!='\'') && (*b!='\0')) b++;
  767.                         }
  768.                         else {
  769.                           while((*b != quotes_replacement) && (*b!='\0')) b++;
  770.                         }
  771.                         c=b--; c+=ndelim;
  772.                         while(*c==' ') c++;
  773.                         if ((strchr(expected_end,*c)) || (*c=='\n') || (*c=='\r')) {
  774.                           c-=(ndelim+1);
  775.                           if ((int) (c - a + 1)) {
  776.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  777.                               char str[512];
  778.                               str[0]='\0';
  779.                               strncatbuff(str,a,minimum((int) (c - a + 1),32));
  780.                               fspc(opt.log,"debug"); fprintf(opt.log,"link detected in javascript: %s"LF,str); test_flush;
  781.                             }
  782.                             p=(int) (a - adr);    // p non nul: TRAITER CHAINE COMME FICHIER
  783.                             if (can_avoid_quotes) {
  784.                               ending_p=quotes_replacement;
  785.                             }
  786.                           }
  787.                         }
  788.                         
  789.                         
  790.                       }
  791.                     }
  792.                   }
  793.                   
  794.                 }
  795.               }
  796.             }
  797.             
  798.           } else {      // ptr == 0
  799.             //p=rech_tageq(adr,"primary");    // lien primaire, yeah
  800.             p=0;          // No stupid tag anymore, raw link
  801.             valid_p=1;    // Valid even if p==0
  802.             while ((adr[p] == '\r') || (adr[p] == '\n'))
  803.               p++;
  804.             //can_avoid_quotes=1;
  805.             ending_p='\r';
  806.           }       
  807.           
  808.         } else if (isspace((unsigned char)*adr)) {
  809.           intag_startattr=adr+1;        // attribute in tag (for dirty parsing)
  810.         }
  811.           
  812.           
  813.           // ------------------------------------------------------------
  814.           // dernier recours - parsing "sale" : détection systématique des .gif, etc.
  815.           // risque: générer de faux fichiers parasites
  816.           // fix: ne parse plus dans les commentaires
  817.           // ------------------------------------------------------------
  818.           if ( (opt.parseall) && (ptr>0) && (!in_media) ) {           // option parsing "brut"
  819.             int incomment_justquit=0;
  820.             if (!is_realspace(*adr)) {
  821.               int noparse=0;
  822.  
  823.               // Gestion des /* */
  824.               if (inscript) {
  825.                 if (parseall_incomment) {
  826.                   if ((*adr=='/') && (*(adr-1)=='*'))
  827.                     parseall_incomment=0;
  828.                   incomment_justquit=1;       // ne pas noter dernier caractËre
  829.                 } else {
  830.                   if ((*adr=='/') && (*(adr+1)=='*'))
  831.                     parseall_incomment=1;
  832.                 }
  833.               } else
  834.                 parseall_incomment=0;
  835.  
  836.               /* vÈrifier que l'on est pas dans un <!-- --> pur */
  837.               if ( (!intag) && (incomment) && (!inscript))
  838.                 noparse=1;        /* commentaire */
  839.  
  840.               // recherche d'URLs
  841.               if ((!parseall_incomment) && (!noparse)) {
  842.                 if (!p) {                   // non dÈja trouvÈ
  843.                   if (adr != r.adr) {     // >1 caractËre
  844.                     // scanner les chaines
  845.                     if ((*adr == '\"') || (*adr=='\'')) {         // "xx.gif" 'xx.gif'
  846.                       if (strchr("=(,",parseall_lastc)) {    // exemple: a="img.gif..
  847.                         char *a=adr;
  848.                         char stop=*adr;  // " ou '
  849.                         int count=0;
  850.                         
  851.                         // sauter caractËres
  852.                         a++;
  853.                         // copier
  854.                         while((*a) && (*a!='\'') && (*a!='\"') && (count<HTS_URLMAXSIZE)) { count++; a++; }
  855.                         
  856.                         // ok chaine terminÈe par " ou '
  857.                         if ((*a == stop) && (count<HTS_URLMAXSIZE) && (count>0)) {
  858.                           char c;
  859.                           char* aend;
  860.                           //
  861.                           aend=a;     // sauver dÈbut
  862.                           a++;
  863.                           while(is_taborspace(*a)) a++;
  864.                           c=*a;
  865.                           if (strchr("),;>/+\r\n",c)) {     // exemple: ..img.gif";
  866.                             // le / est pour funct("img.gif" /* URL */);
  867.                             char tempo[HTS_URLMAXSIZE*2];
  868.                             char type[256];
  869.                             int url_ok=0;      // url valide?
  870.                             tempo[0]='\0'; type[0]='\0';
  871.                             //
  872.                             strncatbuff(tempo,adr+1,count);
  873.                             //
  874.                             if ((!strchr(tempo,' ')) || inscript) {   // espace dedans: mÈfiance! (sauf dans code javascript)
  875.                               int invalid_url=0;
  876.  
  877.                               // escape                              
  878.                               unescape_amp(tempo);
  879.  
  880.                               // Couper au # ou ? Èventuel
  881.                               {
  882.                                 char* a=strchr(tempo,'#');
  883.                                 if (a)
  884.                                   *a='\0';
  885.                                 a=strchr(tempo,'?');
  886.                                 if (a)
  887.                                   *a='\0';
  888.                               }
  889.  
  890.                               // vÈrifier qu'il n'y a pas de caractËres spÈciaux
  891.                               if (!strnotempty(tempo))
  892.                                 invalid_url=1;
  893.                               else if (strchr(tempo,'*')
  894.                                 || strchr(tempo,'<')
  895.                                 || strchr(tempo,'>'))
  896.                                 invalid_url=1;
  897.                               
  898.                               /* non invalide? */
  899.                               if (!invalid_url) {
  900.                                 // Un plus ‡ la fin? Alors ne pas prendre sauf si extension ("/toto.html#"+tag)
  901.                                 if (c!='+') {    // PAS de plus ‡ la fin
  902.                                   char* a;
  903.                                   // "Comparisons of scheme names MUST be case-insensitive" (RFC2616)                                  
  904.                                   //if ((strncmp(tempo,"http://",7)==0) || (strncmp(tempo,"ftp://",6)==0))  // ok pas de problËme
  905.                                   if (
  906.                                        (strfield(tempo,"http:")) 
  907.                                     || (strfield(tempo,"ftp:"))
  908. #if HTS_USEOPENSSL
  909.                                     || (
  910.                                     SSL_is_available &&
  911.                                     (strfield(tempo,"https:"))
  912.                                     )
  913. #endif
  914.                                     )  // ok pas de problËme
  915.                                     url_ok=1;
  916.                                   else if (tempo[strlen(tempo)-1]=='/') {        // un slash: ok..
  917.                                     if (inscript)   // sinon si pas javascript, mÈfiance (rÈpertoire style base?)
  918.                                       url_ok=1;
  919.                                   } else if ((a=strchr(tempo,'/'))) {        // un slash: ok..
  920.                                     if (inscript) {    // sinon si pas javascript, mÈfiance (style "text/css")
  921.                                       if (strchr(a+1,'/'))  // un seul / : abandon (STYLE type='text/css')
  922.                                         url_ok=1;
  923.                                     }
  924.                                   }
  925.                                 }
  926.                                 // Prendre si extension reconnue
  927.                                 if (!url_ok) {
  928.                                   get_httptype(type,tempo,0);
  929.                                   if (strnotempty(type))     // type reconnu!
  930.                                     url_ok=1;
  931.                                   else if (is_dyntype(get_ext(tempo)))  // reconnu php,cgi,asp..
  932.                                     url_ok=1;
  933.                                   // MAIS pas les foobar@aol.com !!
  934.                                   if (strchr(tempo,'@'))
  935.                                     url_ok=0;
  936.                                 }
  937.                                 //
  938.                                 // Ok, cela pourrait Ítre une URL
  939.                                 if (url_ok) {
  940.                                   
  941.                                   // Check if not forbidden tag (id,name..)
  942.                                   if (intag_start_valid) {
  943.                                     if (intag_start)
  944.                                       if (intag_startattr)
  945.                                         if (intag)
  946.                                           if (!inscript)
  947.                                             if (!incomment) {
  948.                                               int i=0,nop=0;
  949.                                               while( (nop==0) && (strnotempty(hts_nodetect[i])) ) {
  950.                                                 nop=rech_tageq(intag_startattr,hts_nodetect[i]);
  951.                                                 i++;
  952.                                               }
  953.                                               // Forbidden tag
  954.                                               if (nop) {
  955.                                                 url_ok=0;
  956.                                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  957.                                                   fspc(opt.log,"debug"); fprintf(opt.log,"dirty parsing: bad tag avoided: %s"LF,hts_nodetect[i-1]); test_flush;
  958.                                                 }
  959.                                               }
  960.                                             }
  961.                                   }
  962.                                   
  963.                                   
  964.                                   // Accepter URL, on la traitera comme une URL normale!!
  965.                                   if (url_ok)
  966.                                     p=1;
  967.  
  968.                                 }
  969.                               }
  970.                             }
  971.                           }
  972.                         }
  973.                       }
  974.                     }
  975.                   }
  976.                 }  // p == 0
  977.                 
  978.                 // plus dans un commentaire
  979.                 if (!incomment_justquit)
  980.                   parseall_lastc=*adr;             // caractËre avant le prochain
  981.                 
  982.               } // not in comment
  983.               
  984.             }  // if realspace
  985.           }  // if parseall
  986.           
  987.           
  988.           // ------------------------------------------------------------
  989.           // p!=0 : on a repéré un éventuel lien
  990.           // ------------------------------------------------------------
  991.           //
  992.           if ((p>0) || (valid_p)) {    // on a repÈrÈ un lien
  993.             //int lien_valide=0;
  994.             char* eadr=NULL;          /* fin de l'URL */
  995.             char* quote_adr=NULL;     /* adresse du ? dans l'adresse */
  996.             int ok=1;
  997.             char quote='\0';
  998.             
  999.             // si nofollow ou un stop a été déclenché, réécrire tous les liens en externe
  1000.             if ((nofollow) || (opt.state.stop))
  1001.               p_nocatch=1;
  1002.  
  1003.             // Ècrire codebase avant, flusher avant code
  1004.             if ((p_type==-1) || (p_type==-2)) {
  1005.               if ((opt.getmode & 1) && (ptr>0)) {
  1006.                 HT_ADD_ADR;    // refresh
  1007.               }
  1008.               lastsaved=adr;    // dernier Ècrit+1
  1009.             }
  1010.             
  1011.             // sauter espaces
  1012.             adr+=p;
  1013.             while((is_space(*adr)) && (quote=='\0')) {
  1014.               if (!quote)
  1015.                 if ((*adr=='\"') || (*adr=='\''))
  1016.                   quote=*adr;                     // on doit attendre cela ‡ la fin
  1017.                                                   // puis quitter
  1018.                 adr++;    // sauter les espaces, "" et cie
  1019.             }
  1020.  
  1021.             /* Stop at \n (LF) if primary links*/
  1022.             if (ptr == 0)
  1023.               quote='\n';
  1024.             /* s'arrÍter que ce soit un ' ou un " : pour document.write('<img src="foo'+a); par exemple! */
  1025.             else if (inscript)
  1026.               quote='\0';
  1027.             
  1028.             // sauter Èventuel \" ou \' javascript
  1029.             if (inscript) {    // on est dans un obj.write("..
  1030.               if (*adr=='\\') {
  1031.                 if ((*(adr+1)=='\'') || (*(adr+1)=='"')) {  // \" ou \'
  1032.                   adr+=2;    // sauter
  1033.                 }
  1034.               }
  1035.             }
  1036.             
  1037.             // sauter content="1;URL=http://..
  1038.             if (p_searchMETAURL) {
  1039.               int l=0;
  1040.               while(
  1041.                 (adr + l + 4 < r.adr + r.size)
  1042.                 && (!strfield(adr+l,"URL=")) 
  1043.                 && (l<128) ) l++;
  1044.               if (!strfield(adr+l,"URL="))
  1045.                 ok=-1;
  1046.               else
  1047.                 adr+=(l+4);
  1048.             }
  1049.  
  1050.             /* Èviter les javascript:document.location=.. : les parser, plutÙt */
  1051.             if (ok!=-1) {
  1052.               if (strfield(adr,"javascript:")) {
  1053.                 ok=-1;
  1054.                 /*
  1055.                 On est dÈsormais dans du code javascript
  1056.                 */
  1057.                 inscript_tag=inscript=1;
  1058.                 inscript_tag_lastc=quote;     /* ‡ attendre ‡ la fin */
  1059.               }
  1060.             }
  1061.             
  1062.             if (p_type==1) {
  1063.               if (*adr=='#') {
  1064.                 adr++;           // sauter # pour usemap etc
  1065.               }
  1066.             }
  1067.             eadr=adr;
  1068.             
  1069.             // ne pas flusher aprËs code si on doit Ècrire le codebase avant!
  1070.             if ((p_type!=-1) && (p_type!=2) && (p_type!=-2)) {
  1071.               if ((opt.getmode & 1) && (ptr>0)) {
  1072.                 HT_ADD_ADR;    // refresh
  1073.               }
  1074.               lastsaved=adr;    // dernier Ècrit+1
  1075.               // après on écrira soit les données initiales,
  1076.               // soit une URL/lien modifié!
  1077.             } else if (p_type==-1) p_flush=adr;    // flusher jusqu'‡ adr ensuite
  1078.             
  1079.             if (ok!=-1) {    // continuer
  1080.               // dÈcouper le lien
  1081.               do {
  1082.                 if ((* (unsigned char*) eadr)<32) {   // caractËre de contrÙle (ou \0)
  1083.                   if (!is_space(*eadr))
  1084.                     ok=0; 
  1085.                 }
  1086.                 if ( ( ((int) (eadr - adr)) ) > HTS_URLMAXSIZE)  // ** trop long, >HTS_URLMAXSIZE caractËres (on prÈvoit HTS_URLMAXSIZE autres pour path)
  1087.                   ok=-1;    // ne pas traiter ce lien
  1088.                 
  1089.                 if (ok > 0) {
  1090.                   //if (*eadr!=' ') {  
  1091.                   if (is_space(*eadr)) {   // guillemets,CR, etc
  1092.                     if ((!quote) || (*eadr==quote))     // si pas d'attente de quote spÈciale ou si quote atteinte
  1093.                       ok=0; 
  1094.                   } else if (ending_p && (*eadr==ending_p))
  1095.                     ok=0;
  1096.                   else {
  1097.                     switch(*eadr) {
  1098.                     case '>': 
  1099.                       if (!quote) {
  1100.                         if (!inscript) {
  1101.                           intag=0;    // PLUS dans un tag!
  1102.                           intag_start_valid=0;
  1103.                         }
  1104.                         ok=0;
  1105.                       }
  1106.                       break;
  1107.                       /*case '<':*/ 
  1108.                     case '#': 
  1109.                       if (*(eadr-1) != '&')       // (
  1110.                         ok=0; 
  1111.                       break;
  1112.                       // case '?': non!
  1113.                     case '\\': if (inscript) ok=0; break;     // \" ou \' point d'arrÍt
  1114.                     case '?': quote_adr=adr; break;           // noter position query
  1115.                     }
  1116.                   }
  1117.                   //}
  1118.                 } 
  1119.                 eadr++;
  1120.               } while(ok==1);     
  1121.               
  1122.               // Empty link detected
  1123.               if ( (((int) (eadr - adr))) <= 1) {       // link empty
  1124.                 ok=-1;        // No
  1125.                 if (*adr != '#') {        // Not empty+unique #
  1126.                   if ( (((int) (eadr - adr)) == 1)) {       // 1=link empty with delim (end_adr-start_adr)
  1127.                     if (quote) {
  1128.                       if ((opt.getmode & 1) && (ptr>0)) { 
  1129.                         HT_ADD("#");        // We add this for a <href="">
  1130.                       }
  1131.                     }
  1132.                   }
  1133.                 }
  1134.               }
  1135.               
  1136.             }
  1137.             
  1138.             if (ok==0) {    // tester un lien
  1139.               char lien[HTS_URLMAXSIZE*2];
  1140.               int meme_adresse=0;      // 0 par dÈfaut pour primary
  1141.               //char *copie_de_adr=adr;
  1142.               //char* p;
  1143.               
  1144.               // construire lien (dÈcoupage)
  1145.               if ( (((int) (eadr -  adr))-1) < HTS_URLMAXSIZE  ) {    // pas trop long?
  1146.                 strncpy(lien,adr,((int) (eadr - adr))-1);
  1147.                 *(lien+  (((int) (eadr -  adr)))-1  )='\0';
  1148.                 //printf("link: %s\n",lien);          
  1149.                 // supprimer les espaces
  1150.                 while((lien[strlen(lien)-1]==' ') && (strnotempty(lien))) lien[strlen(lien)-1]='\0';
  1151.  
  1152.                 
  1153. #if HTS_STRIP_DOUBLE_SLASH
  1154.                 // supprimer les // en / (sauf pour http://)
  1155.                 {
  1156.                   char *a,*p,*q;
  1157.                   int done=0;
  1158.                   a=strchr(lien,':');    // http://
  1159.                   if (a) {
  1160.                     a++;
  1161.                     while(*a=='/') a++;    // position aprËs http://
  1162.                   } else {
  1163.                     a=lien;                // dÈbut
  1164.                     while(*a=='/') a++;    // position aprËs http://
  1165.                   }
  1166.                   q=strchr(a,'?');     // ne pas traiter aprËs '?'
  1167.                   if (!q)
  1168.                     q=a+strlen(a)-1;
  1169.                   while(( p=strstr(a,"//")) && (!done) ) {    // remplacer // par /
  1170.                     if ((int) p>(int) q) {   // aprËs le ? (toto.cgi?param=1//2.3)
  1171.                       done=1;    // stopper
  1172.                     } else {
  1173.                       char tempo[HTS_URLMAXSIZE*2];
  1174.                       tempo[0]='\0';
  1175.                       strncatbuff(tempo,a,(int) p - (int) a);
  1176.                       strcatbuff (tempo,p+1);
  1177.                       strcpybuff(a,tempo);    // recopier
  1178.                     }
  1179.                   }
  1180.                 }
  1181. #endif
  1182.  
  1183.               } else
  1184.                 lien[0]='\0';    // erreur
  1185.               
  1186.               // ------------------------------------------------------
  1187.               // Lien repéré et extrait
  1188.               if (strnotempty(lien)>0) {           // construction du lien
  1189.                 char adr[HTS_URLMAXSIZE*2],fil[HTS_URLMAXSIZE*2];          // ATTENTION adr cache le "vrai" adr
  1190.                 int forbidden_url=-1;              // lien non interdit (mais non autorisÈ..)
  1191.                 int just_test_it=0;                // mode de test des liens
  1192.                 int set_prio_to=0;                 // pour capture de page isolÈe
  1193.                 int import_done=0;                 // lien importÈ (ne pas scanner ensuite *‡ priori*)
  1194.                 //
  1195.                 adr[0]='\0'; fil[0]='\0';
  1196.                 //
  1197.                 // 0: autorisÈ
  1198.                 // 1: interdit (patcher tout de mÍme adresse)
  1199.                 
  1200.                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1201.                   fspc(opt.log,"debug"); fprintf(opt.log,"link detected in html: %s"LF,lien); test_flush;
  1202.                 }
  1203.  
  1204.                 // external check
  1205. #if HTS_ANALYSTE
  1206.                 if (!hts_htmlcheck_linkdetected(lien)) {
  1207.                   error=1;    // erreur
  1208.                   if (opt.errlog) {
  1209.                     fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s refused by external wrapper"LF,lien);
  1210.                     test_flush;
  1211.                   }
  1212.                 }
  1213. #endif
  1214.                 
  1215.                 // purger espaces de début et fin, CR,LF résiduels
  1216.                 // (IMG SRC="foo.<\n>gif")
  1217.                 {
  1218.                   char* a;
  1219.                   while (is_realspace(lien[0])) {
  1220.                     char tempo[HTS_URLMAXSIZE*2];
  1221.                     tempo[0]='\0';
  1222.                     strcpybuff(tempo,lien+1);
  1223.                     strcpybuff(lien,tempo);
  1224.                   }
  1225.                   while(strnotempty(lien)
  1226.                         && (is_realspace(lien[max(0,(int)(strlen(lien))-1)])) ) {
  1227.                     lien[strlen(lien)-1]='\0';
  1228.                   } 
  1229.                   while ((a=strchr(lien,'\n'))) {
  1230.                     char tempo[HTS_URLMAXSIZE*2];
  1231.                     tempo[0]='\0';
  1232.                     strncatbuff(tempo,lien,(int) (a - lien));
  1233.                     strcatbuff(tempo,a+1);
  1234.                     strcpybuff(lien,tempo);
  1235.                   }
  1236.                   while ((a=strchr(lien,'\r'))) {
  1237.                     char tempo[HTS_URLMAXSIZE*2];
  1238.                     tempo[0]='\0';
  1239.                     strncatbuff(tempo,lien,(int) (a - lien));
  1240.                     strcatbuff(tempo,a+1);
  1241.                     strcpybuff(lien,tempo);
  1242.                   }
  1243.                 }
  1244.                 
  1245.                 /* Unescape/escape %20 and other   */
  1246.                 {
  1247.                   char query[HTS_URLMAXSIZE*2];
  1248.                   char* a=strchr(lien,'?');
  1249.                   if (a) {
  1250.                     strcpybuff(query,a);
  1251.                     *a='\0';
  1252.                   } else
  1253.                     query[0]='\0';
  1254.                   // conversion &amp; -> & et autres joyeusetés
  1255.                   unescape_amp(lien);
  1256.                   unescape_amp(query);
  1257.                   // dÈcoder l'inutile (%2E par exemple) et coder espaces
  1258.                   // XXXXXXXXXXXXXXXXX strcpybuff(lien,unescape_http(lien));
  1259.                   strcpybuff(lien,unescape_http_unharm(lien, (no_esc_utf)?0:1));
  1260.                   escape_spc_url(lien);
  1261.                   strcatbuff(lien,query);     /* restore */
  1262.                 }
  1263.                 
  1264.                 // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!
  1265.                 {
  1266.                   char* a=jump_identification(lien);
  1267.                   while( (a=strchr(a,'\\')) ) *a='/';
  1268.                 }
  1269.                 
  1270.                 // supprimer le(s) ./
  1271.                 while ((lien[0]=='.') && (lien[1]=='/')) {
  1272.                   char tempo[HTS_URLMAXSIZE*2];
  1273.                   strcpybuff(tempo,lien+2);
  1274.                   strcpybuff(lien,tempo);
  1275.                 }
  1276.                 if (strnotempty(lien)==0)  // sauf si plus de nom de fichier
  1277.                   strcpybuff(lien,"./");
  1278.                 
  1279.                 // vérifie les /~machin -> /~machin/
  1280.                 // supposition dangereuse?
  1281.                 // OUI!!
  1282. #if HTS_TILDE_SLASH
  1283.                 if (lien[strlen(lien)-1]!='/') {
  1284.                   char *a=lien+strlen(lien)-1;
  1285.                   // Èviter aussi index~1.html
  1286.                   while (((int) a>(int) lien) && (*a!='~') && (*a!='/') && (*a!='.')) a--;
  1287.                   if (*a=='~') {
  1288.                     strcatbuff(lien,"/");    // ajouter slash
  1289.                   }
  1290.                 }
  1291. #endif
  1292.                 
  1293.                 // APPLET CODE="mixer.MixerApplet.class" --> APPLET CODE="mixer/MixerApplet.class"
  1294.                 // yes, this is dirty
  1295.                 // but I'm so lazy..
  1296.                 // and besides the java "code" convention is really a pain in html code
  1297.                 if (p_type==-1) {
  1298.                   char* a=strrchr(lien,'.');    // last dot: the only one that is kept
  1299.                   add_class_dots_to_patch=0;    // counts the dots replaced below
  1300.                   if (a) {
  1301.                     char* b;
  1302.                     do {
  1303.                       b=strchr(lien,'.');       // first dot still present
  1304.                       if ((b != a) && (b)) {
  1305.                         add_class_dots_to_patch++;
  1306.                         *b='/';
  1307.                       }
  1308.                     } while((b != a) && (b));   // stop once the first remaining dot is the last one
  1309.                   }
  1310.                 }
  1311.                 
  1312.                 // Èliminer les Èventuels :80 (port par dÈfaut!)
  1313.                 if (link_has_authority(lien)) {
  1314.                   char * a;
  1315.                   a=strstr(lien,"//");    // "//" authority
  1316.                   if (a)
  1317.                     a+=2;
  1318.                   else
  1319.                     a=lien;
  1320.                   // while((*a) && (*a!='/') && (*a!=':')) a++;
  1321.                   a=jump_toport(a);
  1322.                   if (a) {  // port
  1323.                     int port=0;
  1324.                     int defport=80;
  1325.                     char* b=a+1;
  1326. #if HTS_USEOPENSSL
  1327.                     // FIXME
  1328.                     //if (strfield(adr, "https:")) {
  1329.                     //}
  1330. #endif
  1331.                     while(isdigit((unsigned char)*b)) { port*=10; port+=(int) (*b-'0'); b++; }
  1332.                     if (port==defport) {  // port 80, default - c'est dÈbile
  1333.                       char tempo[HTS_URLMAXSIZE*2];
  1334.                       tempo[0]='\0';
  1335.                       strncatbuff(tempo,lien,(int) (a - lien));
  1336.                       strcatbuff(tempo,a+3);  // sauter :80
  1337.                       strcpybuff(lien,tempo);
  1338.                     }
  1339.                   }
  1340.                 }
  1341.                 
  1342.                 // filtrer les parazites (mailto & cie)
  1343.                 /*
  1344.                 if (strfield(lien,"mailto:")) {  // ne pas traiter
  1345.                   error=1;
  1346.                 } else if (strfield(lien,"news:")) {  // ne pas traiter
  1347.                   error=1;
  1348.                 }
  1349.                 */
  1350.                 
  1351.                 // check whether a .class extension has to be appended
  1352.                 if (!error) {
  1353.                   if (add_class) {
  1354.                     char *a = lien+strlen(lien)-1;
  1355.                     while(( a > lien) && (*a!='/') && (*a!='.')) a--;    // walk back to the last '.' or '/' (or the start of lien)
  1356.                     if (*a != '.')
  1357.                       strcatbuff(lien,".class");    // no dot found in the last path component: append .class
  1358.                     else if (!strfield2(a,".class"))
  1359.                       strcatbuff(lien,".class");    // same when the strfield2() test against ".class" fails
  1360.                   }
  1361.                 }
  1362.                 
  1363.                 // si c'est un chemin, alors vÈrifier (toto/toto.html -> http://www/toto/)
  1364.                 if (!error) {
  1365.                   if ((opt.debug>1) && (opt.log!=NULL)) {
  1366.                     fspc(opt.log,"debug"); fprintf(opt.log,"position link check %s"LF,lien); test_flush;
  1367.                   }
  1368.                   
  1369.                   if ((p_type==2) || (p_type==-2)) {   // code ou codebase                        
  1370.                     // Handle codebase=applet (written instead of applet/): make sure a codebase ends with '/'
  1371.                     if (p_type==-2) {    // codebase
  1372.                       if (strnotempty(lien)) {
  1373.                         if (lien[strlen(lien)-1]!='/') {  // last char of lien itself (fil was tested here by mistake): not a directory yet
  1374.                           strcatbuff(lien,"/");
  1375.                         }
  1376.                       }
  1377.                     }
  1378.                     /* only one ending / (bug on some pages) */
  1379.                     if ((int)strlen(lien)>2) {
  1380.                       while( (lien[strlen(lien)-2]=='/') && ((int)strlen(lien)>2) )    /* double // (bug) */
  1381.                         lien[strlen(lien)-1]='\0';
  1382.                     }
  1383.                     // copier nom host si besoin est
  1384.                     if (!link_has_authority(lien)) {  // pas de http://
  1385.                       char adr2[HTS_URLMAXSIZE*2],fil2[HTS_URLMAXSIZE*2];  // ** euh ident_url_relatif??
  1386.                       if (ident_url_relatif(lien,urladr,urlfil,adr2,fil2)<0) {                        
  1387.                         error=1;
  1388.                       } else {
  1389.                         strcpybuff(lien,"http://");
  1390.                         strcatbuff(lien,adr2);
  1391.                         if (*fil2!='/')
  1392.                           strcatbuff(lien,"/");
  1393.                         strcatbuff(lien,fil2);
  1394.                         {
  1395.                           char* a;
  1396.                           a=lien+strlen(lien)-1;
  1397.                           while((*a) && (*a!='/') && ( a> lien)) a--;
  1398.                           if (*a=='/') {
  1399.                             *(a+1)='\0';
  1400.                           }
  1401.                         }
  1402.                         //char tempo[HTS_URLMAXSIZE*2];
  1403.                         //strcpybuff(tempo,"http://");
  1404.                         //strcatbuff(tempo,urladr);    // host
  1405.                         //if (*lien!='/')
  1406.                         //  strcatbuff(tempo,"/");
  1407.                         //strcatbuff(tempo,lien);
  1408.                         //strcpybuff(lien,tempo);
  1409.                       }
  1410.                     }
  1411.                     
  1412.                     if (!error) {  // pas d'erreur?
  1413.                       if (p_type==2) {   // code ET PAS codebase      
  1414.                         char* a=lien+strlen(lien)-1;
  1415.                         while( (a > lien) && (*a) && (*a!='/')) a--;
  1416.                         if (*a=='/')     // ok on a repÈrÈ le dernier /
  1417.                           *(a+1)='\0';   // couper
  1418.                         else {
  1419.                           *lien='\0';    // Èliminer
  1420.                           error=1;   // erreur, ne pas poursuivre
  1421.                         }      
  1422.                       }
  1423.                       
  1424.                       // stocker base ou codebase?
  1425.                       switch(p_type) {
  1426.                       case 2: { 
  1427.                         //if (*lien!='/') strcatbuff(base,"/");
  1428.                         strcpybuff(base,lien);
  1429.                               }
  1430.                         break;      // base
  1431.                       case -2: {
  1432.                         //if (*lien!='/') strcatbuff(codebase,"/");
  1433.                         strcpybuff(codebase,lien); 
  1434.                                }
  1435.                         break;  // base
  1436.                       }
  1437.                       
  1438.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1439.                         fspc(opt.log,"debug"); fprintf(opt.log,"code/codebase link %s base %s"LF,lien,base); test_flush;
  1440.                       }
  1441.                       //printf("base code: %s - %s\n",lien,base);
  1442.                     }
  1443.                     
  1444.                   } else {
  1445.                     char* _base;
  1446.                     if (p_type==-1)   // code (applet)
  1447.                       _base=codebase;
  1448.                     else
  1449.                       _base=base;
  1450.                     
  1451.                     
  1452.                     // ajouter chemin de base href..
  1453.                     if (strnotempty(_base)) {       // considÈrer base
  1454.                       if (!link_has_authority(lien)) {    // non absolue
  1455.                         //if (*lien!='/') {           // non absolu sur le site (/)
  1456.                         if ( ((int) strlen(_base)+(int) strlen(lien))<HTS_URLMAXSIZE) {
  1457.                           // mailto: and co: do NOT add base
  1458.                           if (ident_url_relatif(lien,urladr,urlfil,adr,fil)>=0) {
  1459.                             char tempo[HTS_URLMAXSIZE*2];
  1460.                             // base est absolue
  1461.                             strcpybuff(tempo,_base);
  1462.                             strcatbuff(tempo,lien + ((*lien=='/')?1:0) );
  1463.                             strcpybuff(lien,tempo);        // patcher en considÈrant base
  1464.                             // ** vÈrifier que ../ fonctionne (ne doit pas arriver mais bon..)
  1465.                             
  1466.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  1467.                               fspc(opt.log,"debug"); fprintf(opt.log,"link modified with code/codebase %s"LF,lien); test_flush;
  1468.                             }
  1469.                           }
  1470.                         } else {
  1471.                           error=1;    // erreur
  1472.                           if (opt.errlog) {
  1473.                             fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s too long with base href"LF,lien);
  1474.                             test_flush;
  1475.                           }
  1476.                         }
  1477.                         //}
  1478.                       }
  1479.                     }
  1480.                     
  1481.                     
  1482.                   }
  1483.                   }
  1484.                   
  1485.                   
  1486.                   // turn any kind of link (http, relative, etc) into an address
  1487.                   // and a path+file (adr,fil)
  1488.                   if (!error) {
  1489.                     int reponse;
  1490.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1491.                       fspc(opt.log,"debug"); fprintf(opt.log,"build relative link %s with %s%s"LF,lien,urladr,urlfil); test_flush;
  1492.                     }
  1493.                     if ((reponse=ident_url_relatif(lien,urladr,urlfil,adr,fil))<0) {                        
  1494.                       adr[0]='\0';    // error: an empty adr marks the link as unusable
  1495.                       if (reponse==-2) {    // -2 is reported as a warning in the error log
  1496.                         if (opt.errlog) {
  1497.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s not caught (unknown ftp:// protocol)"LF,lien);
  1498.                           test_flush;
  1499.                         }
  1500.                       } else {              // any other failure is only logged in debug mode
  1501.                         if ((opt.debug>1) && (opt.errlog!=NULL)) {
  1502.                           fspc(opt.errlog,"debug"); fprintf(opt.errlog,"ident_url_relatif failed for %s with %s%s"LF,lien,urladr,urlfil); test_flush;
  1503.                         }
  1504.                       }
  1505.                     }
  1506.                   } else {
  1507.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1508.                       fspc(opt.log,"debug"); fprintf(opt.log,"link %s not build, error detected before"LF,lien); test_flush;
  1509.                     }
  1510.                     adr[0]='\0';    // an earlier step already failed: no address
  1511.                   }
  1512.                   
  1513. #if HTS_CHECK_STRANGEDIR
  1514.                   // !WARNING!
  1515.                   // Here we test oddities such as www.truc.fr/machin (no trailing slash)
  1516.                   // I have not yet found a way to tell a directory from a file
  1517.                   // over http A PRIORI: so I run a test
  1518.                   // In case of "moved xxx", adr and fil are simply recomputed
  1519.                   // FLAW: the test is performed several times! to be reviewed!!!
  1520.                   if ((adr[0]!='\0') && (strcmp(adr,"file://")) && (p_type!=2) && (p_type!=-2)) {    // closing parenthesis after strcmp() was missing
  1521.                     //## if ((adr[0]!='\0') && (adr[0]!=lOCAL_CHAR) && (p_type!=2) && (p_type!=-2)) {
  1522.                     if (fil[strlen(fil)-1]!='/') {  // not a directory
  1523.                       if (ishtml(fil)==-2) {    // no extension
  1524.                         char loc[HTS_URLMAXSIZE*2];  // possible new location
  1525.                         loc[0]='\0';
  1526.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1527.                           fspc(opt.log,"debug"); fprintf(opt.log,"link-check-directory: %s%s"LF,adr,fil);
  1528.                           test_flush;
  1529.                         }
  1530.                         
  1531.                         // probe for a possible new location
  1532.                         switch (http_location(adr,fil,loc).statuscode) {
  1533.                         case 200: // ok in the end
  1534.                           if (strnotempty(loc)) {  // the address has changed
  1535.                             if (opt.errlog) {
  1536.                               fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s has moved to %s for %s%s"LF,adr,fil,loc,urladr,urlfil);
  1537.                               test_flush;
  1538.                             }
  1539.                             
  1540.                             // recompute adr and fil from the new location!
  1541.                             if (ident_url_absolute(loc,adr,fil)==-1) {
  1542.                               adr[0]='\0';  // cancel
  1543.                               if ((opt.debug>1) && (opt.log!=NULL)) {
  1544.                                 fspc(opt.log,"debug"); fprintf(opt.log,"link-check-dir: %s%s"LF,adr,fil);
  1545.                                 test_flush;
  1546.                               }
  1547.                             }
  1548.                             
  1549.                           }
  1550.                           break;
  1551.                         case -2: case -3:  // timeout or serious error
  1552.                           if (opt.errlog) {
  1553.                             fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Connection too slow for testing link %s%s (from %s%s)"LF,adr,fil,urladr,urlfil);
  1554.                             test_flush;
  1555.                           }
  1556.                           
  1557.                           break;
  1558.                         }
  1559.                         
  1560.                       }
  1561.                     } 
  1562.                   }
  1563. #endif
  1564.                   
  1565.                   // The link only has to be rewritten, it must not produce a link to fetch
  1566.                   // example: <FORM ACTION="url_cgi">
  1567.                   if (p_nocatch) {
  1568.                     forbidden_url=1;    // forbid fetching this link
  1569.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1570.                       fspc(opt.log,"debug"); fprintf(opt.log,"link forced external at %s%s"LF,adr,fil);
  1571.                       test_flush;
  1572.                     }
  1573.                   }
  1574.                   
  1575.                   // Test whether a link has to be accepted or refused (wizard)
  1576.                   // forbidden_url=1 : link refused
  1577.                   // forbidden_url=0 : link accepted
  1578.                   //if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // test permissions?
  1579.                   if ((p_type!=2) && (p_type!=-2)) {    // test permissions? (skipped for the p_type values 2 and -2)
  1580.                     if (!p_nocatch) {
  1581.                       if (adr[0]!='\0') {          
  1582.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1583.                           fspc(opt.log,"debug"); fprintf(opt.log,"wizard link test at %s%s.."LF,adr,fil);
  1584.                           test_flush;
  1585.                         }
  1586.                         forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1587.                           adr,fil,
  1588.                           &filters,&filptr,opt.maxfilter,
  1589.                           &robots,
  1590.                           &set_prio_to,
  1591.                           &just_test_it);
  1592.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1593.                           fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard link test: %d"LF,forbidden_url);
  1594.                           test_flush;
  1595.                         }
  1596.                       }
  1597.                     }
  1598.                   }
  1599.                   
  1600.                   // compute meme_adresse ("same address"): strfield2() result on adr vs urladr, both passed through jump_identification()
  1601.                   meme_adresse=strfield2(jump_identification(adr),jump_identification(urladr));
  1602.                   
  1603.                   
  1604.                   
  1605.                   // DÈbut partie sauvegarde
  1606.                   
  1607.                   // ici on forme le nom du fichier ‡ sauver, et on patche l'URL
  1608.                   if (adr[0]!='\0') {
  1609.                     // savename: simplifier les ../ et autres joyeusetÈs
  1610.                     char save[HTS_URLMAXSIZE*2];
  1611.                     int r_sv=0;
  1612.                     // En cas de moved, adresse premiËre
  1613.                     char former_adr[HTS_URLMAXSIZE*2];
  1614.                     char former_fil[HTS_URLMAXSIZE*2];
  1615.                     //
  1616.                     save[0]='\0'; former_adr[0]='\0'; former_fil[0]='\0';
  1617.                     //
  1618.                     
  1619.                     // nom du chemin ‡ sauver si on doit le calculer
  1620.                     // note: url_savename peut dÈcider de tester le lien si il le trouve
  1621.                     // suspect, et modifier alors adr et fil
  1622.                     // dans ce cas on aura une rÈfÈrence directe au lieu des traditionnels
  1623.                     // moved en cascade (impossible ‡ reproduire ‡ priori en local, lorsque des fichiers
  1624.                     // gif sont impliquÈs par exemple)
  1625.                     if ((p_type!=2) && (p_type!=-2)) {  // pas base href ou codebase
  1626.                       if (forbidden_url!=1) {
  1627.                         char last_adr[HTS_URLMAXSIZE*2];
  1628.                         last_adr[0]='\0';
  1629.                         //char last_fil[HTS_URLMAXSIZE*2]="";
  1630.                         strcpybuff(last_adr,adr);    // ancienne adresse
  1631.                         //strcpybuff(last_fil,fil);    // ancien chemin
  1632.                         r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,&opt,liens,lien_tot,back,back_max,&cache,&hash,ptr,numero_passe);
  1633.                         if (strcmp(jump_identification(last_adr),jump_identification(adr)) != 0) {  // a changÈ
  1634.                           
  1635.                           // 2e test si moved
  1636.                           
  1637.                           // Tester si un lien doit Ítre acceptÈ ou refusÈ (wizard)
  1638.                           // forbidden_url=1 : lien refusÈ
  1639.                           // forbidden_url=0 : lien acceptÈ
  1640.                           if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1641.                             if (!p_nocatch) {
  1642.                               if (adr[0]!='\0') {          
  1643.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1644.                                   fspc(opt.log,"debug"); fprintf(opt.log,"wizard moved link retest at %s%s.."LF,adr,fil);
  1645.                                   test_flush;
  1646.                                 }
  1647.                                 forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1648.                                   adr,fil,
  1649.                                   &filters,&filptr,opt.maxfilter,
  1650.                                   &robots,
  1651.                                   &set_prio_to,
  1652.                                   &just_test_it);
  1653.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1654.                                   fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard moved link retest: %d"LF,forbidden_url);
  1655.                                   test_flush;
  1656.                                 }
  1657.                               }
  1658.                             }
  1659.                           }
  1660.                           
  1661.                           //import_done=1;    // c'est un import!
  1662.                           meme_adresse=0;   // on a changÈ
  1663.                         }
  1664.                       } else {
  1665.                         strcpybuff(save,"");  // dummy
  1666.                       }
  1667.                     }
  1668.                     if (r_sv!=-1) {  // pas d'erreur, on continue
  1669.                       /* log */
  1670.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1671.                         fspc(opt.log,"debug");
  1672.                         if (forbidden_url!=1) {    // le lien va Ítre chargÈ
  1673.                           if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, pas un lien
  1674.                             fprintf(opt.log,"Code/Codebase: %s%s"LF,adr,fil);
  1675.                           } else if ((opt.getmode & 4)==0) {
  1676.                             fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1677.                           } else {
  1678.                             if (!ishtml(fil))
  1679.                               fprintf(opt.log,"Record after: %s%s -> %s"LF,adr,fil,save);
  1680.                             else
  1681.                               fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1682.                           } 
  1683.                         } else
  1684.                           fprintf(opt.log,"External: %s%s"LF,adr,fil);
  1685.                         test_flush;
  1686.                       }
  1687.                       /* FIN log */
  1688.                       
  1689.                       // Ècrire lien
  1690.                       if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, sauter
  1691.                         lastsaved=eadr-1+1;  // sauter "
  1692.                       }
  1693.                       /* */
  1694.                       else if (opt.urlmode==0) {    // URL absolue dans tous les cas
  1695.                         if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1696.                           if (!link_has_authority(adr)) {
  1697.                             HT_ADD("http://");
  1698.                           } else {
  1699.                             char* aut = strstr(adr, "//");
  1700.                             if (aut) {
  1701.                               char tmp[256];
  1702.                               tmp[0]='\0';
  1703.                               strncatbuff(tmp, adr, (int) (aut - adr));   // scheme
  1704.                               HT_ADD(tmp);          // Protocol
  1705.                               HT_ADD("//");
  1706.                             }
  1707.                           }
  1708.  
  1709.                           if (!opt.passprivacy) {
  1710.                             HT_ADD(jump_protocol(adr));           // Password
  1711.                           } else {
  1712.                             HT_ADD(jump_identification(adr));     // No Password
  1713.                           }
  1714.                           if (*fil!='/')
  1715.                             HT_ADD("/");
  1716.                           HT_ADD(fil);
  1717.                         }
  1718.                         lastsaved=eadr-1;    // dernier Ècrit+1 (enfin euh apres on fait un ++ alors hein)
  1719.                       /* */
  1720.                       } else if (opt.urlmode >= 4) {    // ne rien faire dans tous les cas!
  1721.                       /* */
  1722.                       /* leave the link 'as is' */
  1723.                       /* Sinon, dÈpend de interne/externe */
  1724.                       } else if (forbidden_url==1) {    // le lien ne sera pas chargÈ, rÈfÈrence externe!
  1725.                         if ((opt.getmode & 1) && (ptr>0)) {
  1726.                           if (p_type!=-1) {     // pas que le nom de fichier (pas classe java)
  1727.                             if (!opt.external) {
  1728.                               if (!link_has_authority(adr)) {
  1729.                                 HT_ADD("http://");
  1730.                                 if (!opt.passprivacy) {
  1731.                                   HT_ADD(adr);     // Password
  1732.                                 } else {
  1733.                                   HT_ADD(jump_identification(adr));     // No Password
  1734.                                 }
  1735.                                 if (*fil!='/')
  1736.                                   HT_ADD("/");
  1737.                                 HT_ADD(fil);
  1738.                               } else {
  1739.                                 char* aut = strstr(adr, "//");
  1740.                                 if (aut) {
  1741.                                   char tmp[256];
  1742.                                   tmp[0]='\0';
  1743.                                   strncatbuff(tmp, adr, (int) (aut - adr));   // scheme
  1744.                                   HT_ADD(tmp);          // Protocol
  1745.                                   HT_ADD("//");
  1746.                                   if (!opt.passprivacy) {
  1747.                                     HT_ADD(jump_protocol(adr));          // Password
  1748.                                   } else {
  1749.                                     HT_ADD(jump_identification(adr));     // No Password
  1750.                                   }
  1751.                                   if (*fil!='/')
  1752.                                     HT_ADD("/");
  1753.                                   HT_ADD(fil);
  1754.                                 }
  1755.                               }
  1756.                               //
  1757.                             } else {    // fichier/page externe, mais on veut gÈnÈrer une erreur
  1758.                               //
  1759.                               int patch_it=0;
  1760.                               int add_url=0;
  1761.                               char* cat_name=NULL;
  1762.                               char* cat_data=NULL;
  1763.                               int cat_nb=0;
  1764.                               int cat_data_len=0;
  1765.                               
  1766.                               // ajouter lien external
  1767.                               switch ( (link_has_authority(adr)) ? 1 : ( (fil[strlen(fil)-1]=='/')?1:(ishtml(fil))  ) ) {
  1768.                               case 1: case -2:       // html ou rÈpertoire
  1769.                                 if (opt.getmode & 1) {  // sauver html
  1770.                                   patch_it=1;   // redirect
  1771.                                   add_url=1;    // avec link?
  1772.                                   cat_name="external.html";
  1773.                                   cat_nb=0;
  1774.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  1775.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  1776.                                 }
  1777.                                 break;
  1778.                               default:    // inconnu
  1779.                                 // asp, cgi..
  1780.                                 if (is_dyntype(get_ext(fil))) {
  1781.                                   patch_it=1;   // redirect
  1782.                                   add_url=1;    // avec link?
  1783.                                   cat_name="external.html";
  1784.                                   cat_nb=0;
  1785.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  1786.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  1787.                                 } else if ( (strfield2(fil+max(0,(int)strlen(fil)-4),".gif")) 
  1788.                                   || (strfield2(fil+max(0,(int)strlen(fil)-4),".jpg")) 
  1789.                                   || (strfield2(fil+max(0,(int)strlen(fil)-4),".xbm")) 
  1790.                                   || (ishtml(fil)!=0) ) {
  1791.                                   patch_it=1;   // redirect
  1792.                                   add_url=1;    // avec link aussi
  1793.                                   cat_name="external.gif";
  1794.                                   cat_nb=1;
  1795.                                   cat_data=HTS_DATA_UNKNOWN_GIF;
  1796.                                   cat_data_len=HTS_DATA_UNKNOWN_GIF_LEN;
  1797.                                 }
  1798.                                 break;
  1799.                               }// html,gif
  1800.                               
  1801.                               if (patch_it) {
  1802.                                 char save[HTS_URLMAXSIZE*2];
  1803.                                 char tempo[HTS_URLMAXSIZE*2];
  1804.                                 strcpybuff(save,opt.path_html);
  1805.                                 strcatbuff(save,cat_name);
  1806.                                 if (lienrelatif(tempo,save,savename)==0) {
  1807.                                   if (!no_esc_utf)
  1808.                                     escape_uri(tempo);     // escape with %xx
  1809.                                   else
  1810.                                     escape_uri_utf(tempo);     // escape with %xx
  1811.                                   HT_ADD(tempo);    // page externe
  1812.                                   if (add_url) {
  1813.                                     HT_ADD("?link=");    // page externe
  1814.                                     
  1815.                                     // same as above
  1816.                                     if (!link_has_authority(adr)) {
  1817.                                       HT_ADD("http://");
  1818.                                       if (!opt.passprivacy) {
  1819.                                         HT_ADD(adr);     // Password
  1820.                                       } else {
  1821.                                         HT_ADD(jump_identification(adr));     // No Password
  1822.                                       }
  1823.                                       if (*fil!='/')
  1824.                                         HT_ADD("/");
  1825.                                       HT_ADD(fil);
  1826.                                     } else {
  1827.                                       char* aut = strstr(adr, "//");
  1828.                                       if (aut) {
  1829.                                         char tmp[256];
  1830.                                         tmp[0]='\0';
  1831.                                         strncatbuff(tmp, adr, (int) (aut - adr) + 2);   // scheme
  1832.                                         HT_ADD(tmp);
  1833.                                         if (!opt.passprivacy) {
  1834.                                           HT_ADD(jump_protocol(adr));          // Password
  1835.                                         } else {
  1836.                                           HT_ADD(jump_identification(adr));     // No Password
  1837.                                         }
  1838.                                         if (*fil!='/')
  1839.                                           HT_ADD("/");
  1840.                                         HT_ADD(fil);
  1841.                                       }
  1842.                                     }
  1843.                                     //
  1844.  
  1845.                                   }
  1846.                                 }
  1847.                                 
  1848.                                 // écrire fichier?
  1849.                                 if (verif_external(cat_nb,1)) {
  1850.                                 //if (!fexist(fconcat(opt.path_html,cat_name))) {
  1851.                                   FILE* fp = filecreate(fconcat(opt.path_html,cat_name));
  1852.                                   if (fp) {
  1853.                                     if (cat_data_len==0) {   // texte
  1854.                                       verif_backblue(opt.path_html);
  1855.                                       fprintf(fp,"%s%s","<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"LF,cat_data);
  1856.                                     } else {                    // data
  1857.                                       fwrite(cat_data,cat_data_len,1,fp);
  1858.                                     }
  1859.                                     fclose(fp);
  1860.                                     usercommand(0,NULL,fconcat(opt.path_html,cat_name));
  1861.                                   }
  1862.                                 }
  1863.                               }  else {    // Ècrire normalement le nom de fichier
  1864.                                 HT_ADD("http://");
  1865.                                 if (!opt.passprivacy) {
  1866.                                   HT_ADD(adr);       // Password
  1867.                                 } else {
  1868.                                   HT_ADD(jump_identification(adr));       // No Password
  1869.                                 }
  1870.                                 if (*fil!='/')
  1871.                                   HT_ADD("/");
  1872.                                 HT_ADD(fil);
  1873.                               }// patcher?
  1874.                             }  // external
  1875.                           } else {  // que le nom de fichier (classe java)
  1876.                             // en gros recopie de plus bas: copier codebase et base
  1877.                             if (p_flush) {
  1878.                               char tempo[HTS_URLMAXSIZE*2];    // <-- ajoutÈ
  1879.                               char tempo_pat[HTS_URLMAXSIZE*2];
  1880.  
  1881.                               // Calculer chemin
  1882.                               tempo_pat[0]='\0';
  1883.                               strcpybuff(tempo,fil);  // <-- ajoutÈ
  1884.                               {
  1885.                                 char* a=strrchr(tempo,'/');
  1886.  
  1887.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  1888.                                 // we have to do the contrary now
  1889.                                 if (add_class_dots_to_patch>0) {
  1890.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  1891.                                     *a='.';     // convert "false" java / into .
  1892.                                     add_class_dots_to_patch--;
  1893.                                     a=strrchr(tempo,'/');
  1894.                                   }
  1895.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  1896.                                   if (add_class_dots_to_patch) {
  1897.                                     if (opt.errlog) {
  1898.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Error: can not rewind java path %s, check html code"LF,tempo);
  1899.                                       test_flush;
  1900.                                     }
  1901.                                   }
  1902.                                 }
  1903.  
  1904.                                 // Cut path/filename
  1905.                                 if (a) {
  1906.                                   char tempo2[HTS_URLMAXSIZE*2];
  1907.                                   strcpybuff(tempo2,a+1);         // FICHIER
  1908.                                   strncatbuff(tempo_pat,tempo,(int) (a - tempo)+1);  // chemin
  1909.                                   strcpybuff(tempo,tempo2);                     // fichier
  1910.                                 }
  1911.                               }
  1912.                               
  1913.                               // écrire codebase="chemin"
  1914.                               if ((opt.getmode & 1) && (ptr>0)) {
  1915.                                 char tempo4[HTS_URLMAXSIZE*2];
  1916.                                 tempo4[0]='\0';
  1917.                                 
  1918.                                 if (strnotempty(tempo_pat)) {
  1919.                                   HT_ADD("codebase=\"http://");
  1920.                                   if (!opt.passprivacy) {
  1921.                                     HT_ADD(adr);  // Password
  1922.                                   } else {
  1923.                                     HT_ADD(jump_identification(adr));  // No Password
  1924.                                   }
  1925.                                   if (*tempo_pat!='/') HT_ADD("/");
  1926.                                   HT_ADD(tempo_pat);
  1927.                                   HT_ADD("\" ");
  1928.                                 }
  1929.                                 
  1930.                                 strncatbuff(tempo4,lastsaved,(int) (p_flush - lastsaved));
  1931.                                 HT_ADD(tempo4);    // refresh code="
  1932.                                 HT_ADD(tempo);
  1933.                               }
  1934.                             }
  1935.                           }
  1936.                         }
  1937.                         lastsaved=eadr-1;
  1938.                       }
  1939.                       /*
  1940.                       else if (opt.urlmode==1) {    // ABSOLU, c'est le cas le moins courant
  1941.                       //  NE FONCTIONNE PAS!!  (et est inutile)
  1942.                       if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1943.                       // Ècrire le lien modifiÈ, absolu
  1944.                       HT_ADD("file:");
  1945.                       if (*save=='/')
  1946.                       HT_ADD(save+1)
  1947.                       else
  1948.                       HT_ADD(save)
  1949.                       }
  1950.                       lastsaved=eadr-1;    // dernier Ècrit+1 (enfin euh apres on fait un ++ alors hein)
  1951.                       }
  1952.                       */
  1953.                       else if (opt.urlmode==3) {    // URI absolue /
  1954.                         if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1955.                           HT_ADD(fil);
  1956.                         }
  1957.                         lastsaved=eadr-1;    // dernier Ècrit+1 (enfin euh apres on fait un ++ alors hein)
  1958.                       }
  1959.                       else if (opt.urlmode==2) {  // RELATIF
  1960.                         char tempo[HTS_URLMAXSIZE*2];
  1961.                         tempo[0]='\0';
  1962.                         // calculer le lien relatif
  1963.                         
  1964.                         if (lienrelatif(tempo,save,savename)==0) {
  1965.                           if (!no_esc_utf)
  1966.                             escape_uri(tempo);     // escape with %xx
  1967.                           else
  1968.                             escape_uri_utf(tempo);     // escape with %xx
  1969.                           if ((opt.debug>1) && (opt.log!=NULL)) {
  1970.                             fspc(opt.log,"debug"); fprintf(opt.log,"relative link at %s build with %s and %s: %s"LF,adr,save,savename,tempo);
  1971.                             test_flush;
  1972.                           }
  1973.                           
  1974.                           // lien applet (code) - il faut placer un codebase avant
  1975.                           if (p_type==-1) {  // que le nom de fichier
  1976.                             
  1977.                             if (p_flush) {
  1978.                               char tempo_pat[HTS_URLMAXSIZE*2];
  1979.                               tempo_pat[0]='\0';
  1980.                               {
  1981.                                 char* a=strrchr(tempo,'/');
  1982.  
  1983.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  1984.                                 // we have to do the contrary now
  1985.                                 if (add_class_dots_to_patch>0) {
  1986.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  1987.                                     *a='.';     // convert "false" java / into .
  1988.                                     add_class_dots_to_patch--;
  1989.                                     a=strrchr(tempo,'/');
  1990.                                   }
  1991.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  1992.                                   if (add_class_dots_to_patch) {
  1993.                                     if (opt.errlog) {
  1994.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Error: can not rewind java path %s, check html code"LF,tempo);
  1995.                                       test_flush;
  1996.                                     }
  1997.                                   }
  1998.                                 }
  1999.  
  2000.                                 if (a) {
  2001.                                   char tempo2[HTS_URLMAXSIZE*2];
  2002.                                   strcpybuff(tempo2,a+1);
  2003.                                   strncatbuff(tempo_pat,tempo,(int) (a - tempo)+1);  // chemin
  2004.                                   strcpybuff(tempo,tempo2);                     // fichier
  2005.                                 }
  2006.                               }
  2007.                               
  2008.                               // écrire codebase="chemin"
  2009.                               if ((opt.getmode & 1) && (ptr>0)) {
  2010.                                 char tempo4[HTS_URLMAXSIZE*2];
  2011.                                 tempo4[0]='\0';
  2012.                                 
  2013.                                 if (strnotempty(tempo_pat)) {
  2014.                                   HT_ADD("codebase=\"");
  2015.                                   HT_ADD(tempo_pat);
  2016.                                   HT_ADD("\" ");
  2017.                                 }
  2018.                                 
  2019.                                 strncatbuff(tempo4,lastsaved,(int) (p_flush - lastsaved));
  2020.                                 HT_ADD(tempo4);    // refresh code="
  2021.                               }
  2022.                             }
  2023.                             //lastsaved=adr;    // dernier Ècrit+1
  2024.                           }                              
  2025.                           
  2026.                           if ((opt.getmode & 1) && (ptr>0)) {
  2027.                             // écrire le lien modifié, relatif
  2028.                             HT_ADD(tempo);
  2029.  
  2030.                             // Add query-string, for informational purpose only
  2031.                             // Useless, because all parameters-pages are saved into different targets
  2032.                             if (opt.includequery) {
  2033.                               char* a=strchr(lien,'?');
  2034.                               if (a) {
  2035.                                 HT_ADD(a);
  2036.                               }
  2037.                             }
  2038.                           }
  2039.                           lastsaved=eadr-1;    // dernier Ècrit+1 (enfin euh apres on fait un ++ alors hein)
  2040.                         } else {
  2041.                           if (opt.errlog) {
  2042.                             fprintf(opt.errlog,"Error building relative link %s and %s"LF,save,savename);
  2043.                             test_flush;
  2044.                           }
  2045.                         }
  2046.                       }  // sinon le lien sera Ècrit normalement
  2047.                       
  2048.                       
  2049. #if 0
  2050.                       if (fexist(save)) {    // le fichier existe..
  2051.                         adr[0]='\0';
  2052.                         //if ((opt.debug>0) && (opt.log!=NULL)) {
  2053.                         if (opt.errlog) {
  2054.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link has already been written on disk, cancelled: %s"LF,save);
  2055.                           test_flush;
  2056.                         }
  2057.                       }
  2058. #endif                            
  2059.                       
  2060.                       /* Security check */
  2061.                       if (strlen(save) >= HTS_URLMAXSIZE) {
  2062.                         adr[0]='\0';
  2063.                         if (opt.errlog) {
  2064.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link is too long: %s"LF,save);
  2065.                           test_flush;
  2066.                         }
  2067.                       }
  2068.  
  2069.                       if ((adr[0]!='\0') && (p_type!=2) && (p_type!=-2) && ( (forbidden_url!=1) || (just_test_it))) {  // si le fichier n'existe pas, ajouter ‡ la liste                            
  2070.                         // n'y a-t-il pas trop de liens?
  2071.                         if (lien_tot+1 >= lien_max-4) {    // trop de liens!
  2072.                           printf("PANIC! : Too many URLs : >%d [%d]\n",lien_tot,__LINE__);
  2073.                           if (opt.errlog) {
  2074.                             fprintf(opt.errlog,LF"Too many URLs, giving up..(>%d)"LF,lien_max);
  2075.                             fprintf(opt.errlog,"To avoid that: use #L option for more links (example: -#L1000000)"LF);
  2076.                             test_flush;
  2077.                           }
  2078.                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2079.                           XH_uninit;   // dÈsallocation mÈmoire & buffers
  2080.                           return 0;
  2081.                           
  2082.                         } else {    // noter le lien sur la listes des liens ‡ charger
  2083.                           int pass_fix,dejafait=0;
  2084.                           
  2085.                           // Calculer la priorité de ce lien
  2086.                           if ((opt.getmode & 4)==0) {    // traiter html aprËs
  2087.                             pass_fix=0;
  2088.                           } else {    // vÈrifier que ce n'est pas un !html
  2089.                             if (!ishtml(fil))
  2090.                               pass_fix=1;        // prioritÈ infÈrieure (traiter aprËs)
  2091.                             else
  2092.                               pass_fix=max(0,numero_passe);    // prioritÈ normale
  2093.                           }
  2094.                           
  2095.                           /* If the file seems to be an html file, get depth-1 */
  2096.                           /*
  2097.                           if (strnotempty(save)) {
  2098.                             if (ishtml(save) == 1) {
  2099.                               // descore_prio = 2;
  2100.                             } else {
  2101.                               // descore_prio = 1;
  2102.                             }
  2103.                           }
  2104.                           */
  2105.                           
  2106.                           // vérifier que le lien n'a pas déjà été noté
  2107.                           // si c'est le cas, alors il faut s'assurer que la priorité associée
  2108.                           // au fichier est la plus grande des deux priorités
  2109.                           //
  2110.                           // On part de la fin et on essaye de se presser (économise temps machine)
  2111. #if HTS_HASH
  2112.                           {
  2113.                             int i=hash_read(&hash,save,"",0);      // lecture type 0 (sav)
  2114.                             if (i>=0) {
  2115.                               liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth - 1);
  2116.                               dejafait=1;
  2117.                             }
  2118.                           }
  2119. #else
  2120.                           {
  2121.                             int l;
  2122.                             int i;
  2123.                             l=strlen(save);  // opti
  2124.                             for(i=lien_tot-1;(i>=0) && (dejafait==0);i--) {
  2125.                               if (liens[i]->sav_len==l) {    // mÍme taille de chaÓne
  2126.                                 if (strcmp(liens[i]->sav,save)==0) {    // existe dÈja
  2127.                                   liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth - 1);
  2128.                                   dejafait=1;
  2129.                                 }
  2130.                               }
  2131.                             }
  2132.                           }
  2133. #endif
  2134.                           
  2135.                           // le lien n'a jamais été créé.
  2136.                           // cette fois-ci, on le crée!
  2137.                           if (!dejafait) {                                
  2138.                             //
  2139.                             // >>>> CREER LE LIEN <<<<
  2140.                             //
  2141.                             // enregistrer lien ‡ charger
  2142.                             //liens[lien_tot]->adr[0]=liens[lien_tot]->fil[0]=liens[lien_tot]->sav[0]='\0';
  2143.                             // mÍme adresse: l'objet pËre est l'objet pËre de l'actuel
  2144.                             
  2145.                             // DEBUT ROBOTS.TXT AJOUT
  2146.                             if (!just_test_it) {
  2147.                               if (
  2148.                                 (!strfield(adr,"ftp://"))         // non ftp
  2149.                              && (!strfield(adr,"file://")) ) {    // non file
  2150.                                 if (opt.robots) {    // rÈcupÈrer robots
  2151.                                   if (ishtml(fil)!=0) {                       // pas la peine pour des fichiers isolÈs
  2152.                                     if (checkrobots(&robots,adr,"") != -1) {    // robots.txt ?
  2153.                                       checkrobots_set(&robots,adr,"");          // ajouter entrÈe vide
  2154.                                       if (checkrobots(&robots,adr,"") == -1) {    // robots.txt ?
  2155.                                         // enregistrer robots.txt (MACRO)
  2156.                                         liens_record(adr,"/robots.txt","","","");
  2157.                                         if (liens[lien_tot]==NULL) {  // erreur, pas de place rÈservÈe
  2158.                                           printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2159.                                           if (opt.errlog) { 
  2160.                                             fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  2161.                                             test_flush;
  2162.                                           }
  2163.                                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2164.                                           XH_uninit;    // dÈsallocation mÈmoire & buffers
  2165.                                           return 0;
  2166.                                         }  
  2167.                                         liens[lien_tot]->testmode=0;          // pas mode test
  2168.                                         liens[lien_tot]->link_import=0;       // pas mode import     
  2169.                                         liens[lien_tot]->premier=lien_tot;
  2170.                                         liens[lien_tot]->precedent=ptr;
  2171.                                         liens[lien_tot]->depth=0;
  2172.                                         liens[lien_tot]->pass2=max(0,numero_passe);
  2173.                                         liens[lien_tot]->retry=0;
  2174.                                         lien_tot++;  // UN LIEN DE PLUS
  2175. #if DEBUG_ROBOTS
  2176.                                         printf("robots.txt: added file robots.txt for %s\n",adr);
  2177. #endif
  2178.                                         if ((opt.debug>1) && (opt.log!=NULL)) {
  2179.                                           fspc(opt.log,"debug"); fprintf(opt.log,"robots.txt added at %s"LF,adr);
  2180.                                           test_flush;
  2181.                                         }
  2182.                                       } else {
  2183.                                         if (opt.errlog) {   
  2184.                                           fprintf(opt.errlog,"Unexpected robots.txt error at %d"LF,__LINE__);
  2185.                                           test_flush;
  2186.                                         }
  2187.                                       }
  2188.                                     }
  2189.                                   }
  2190.                                 }
  2191.                               }
  2192.                             }
  2193.                             // FIN ROBOTS.TXT AJOUT
  2194.                             
  2195.                             // enregistrer (MACRO)
  2196.                             liens_record(adr,fil,save,former_adr,former_fil);
  2197.                             if (liens[lien_tot]==NULL) {  // erreur, pas de place rÈservÈe
  2198.                               printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2199.                               if (opt.errlog) { 
  2200.                                 fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  2201.                                 test_flush;
  2202.                               }
  2203.                               if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2204.                               XH_uninit;    // dÈsallocation mÈmoire & buffers
  2205.                               return 0;
  2206.                             }  
  2207.                             
  2208.                             // mode test?
  2209.                             if (!just_test_it)
  2210.                               liens[lien_tot]->testmode=0;          // pas mode test
  2211.                             else
  2212.                               liens[lien_tot]->testmode=1;          // mode test
  2213.                             if (!import_done)
  2214.                               liens[lien_tot]->link_import=0;       // pas mode import
  2215.                             else
  2216.                               liens[lien_tot]->link_import=1;       // mode import
  2217.                             // écrire autres paramètres de la structure-lien
  2218.                             if ((meme_adresse) && (!import_done) && (liens[ptr]->premier != 0))
  2219.                               liens[lien_tot]->premier=liens[ptr]->premier;
  2220.                             else    // sinon l'objet pËre est le prÈcÈdent lui mÍme
  2221.                               liens[lien_tot]->premier=lien_tot;
  2222.                             // liens[lien_tot]->premier=ptr;
  2223.                             
  2224.                             liens[lien_tot]->precedent=ptr;
  2225.                             // noter la prioritÈ
  2226.                             if (!set_prio_to)
  2227.                               liens[lien_tot]->depth=liens[ptr]->depth - 1;
  2228.                             else
  2229.                               liens[lien_tot]->depth=max(0,min(liens[ptr]->depth-1,set_prio_to-1));         // PRIORITE NULLE (catch page)
  2230.                             // noter pass
  2231.                             liens[lien_tot]->pass2=pass_fix;
  2232.                             liens[lien_tot]->retry=opt.retry;
  2233.                             
  2234.                             //strcpybuff(liens[lien_tot]->adr,adr);
  2235.                             //strcpybuff(liens[lien_tot]->fil,fil);
  2236.                             //strcpybuff(liens[lien_tot]->sav,save); 
  2237.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  2238.                               if (!just_test_it) {
  2239.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, NOTE: %s%s -> %s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil,liens[lien_tot]->sav);
  2240.                               } else {
  2241.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, TEST: %s%s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil);
  2242.                               }
  2243.                               test_flush;
  2244.                             }
  2245.                             
  2246.                             lien_tot++;  // UN LIEN DE PLUS
  2247.                           } else { // if !dejafait
  2248.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  2249.                               fspc(opt.log,"debug"); fprintf(opt.log,"link has already been recorded, cancelled: %s"LF,save);
  2250.                               test_flush;
  2251.                             }
  2252.                             
  2253.                           }
  2254.                           
  2255.                           
  2256.                         }   // si pas trop de liens
  2257.                       }   // si adr[0]!='\0'
  2258.                       
  2259.                       
  2260.                     }  // if adr[0]!='\0' 
  2261.                     
  2262.                   }  // if adr[0]!='\0'
  2263.                   
  2264.                 }    // if strlen(lien)>0
  2265.                 
  2266.               }   // if ok==0      
  2267.               
  2268.               adr=eadr-1;  // ** sauter
  2269.               
  2270.             }  // if (p) 
  2271.             
  2272.           }  // si '<' ou '>'
  2273.           
  2274.           // plus loin
  2275.           adr++;
  2276.  
  2277.  
  2278.           /* Optimization: if we are scanning in HTML data (not in tag or script), 
  2279.           then jump to the next starting tag */
  2280.           if (ptr>0) {
  2281.             if ( (!intag)         /* Not in tag */
  2282.               && (!inscript)      /* Not in (java)script */
  2283.               && (!incomment)     /* Not in comment (<!--) */
  2284.               && (!inscript_tag)  /* Not in tag with script inside */
  2285.               ) 
  2286.             {
  2287.               /* Not at the end */
  2288.               if (( ((int) (adr - r.adr)) ) < r.size) {
  2289.                 /* Not on a starting tag yet */
  2290.                 if (*adr != '<') {
  2291.                   char* adr_next = strchr(adr,'<');
  2292.                   /* Jump to near end (index hack) */
  2293.                   if (!adr_next) {
  2294.                     if (
  2295.                       ( (int)(adr - r.adr) < (r.size - 4)) 
  2296.                       &&
  2297.                       (r.size > 4)
  2298.                       ) {
  2299.                       adr = r.adr + r.size - 2;
  2300.                     }
  2301.                   } else {
  2302.                     adr = adr_next;
  2303.                   }
  2304.                 }
  2305.               }
  2306.             }
  2307.           }
  2308.           
  2309.           // ----------
  2310.           // écrire peu à peu
  2311.           if ((opt.getmode & 1) && (ptr>0)) HT_ADD_ADR;
  2312.           lastsaved=adr;    // dernier Ècrit+1
  2313.           // ----------
  2314.           
  2315.           // pour les stats du shell si parsing trop long
  2316. #if HTS_ANALYSTE
  2317.           if (r.size)
  2318.             _hts_in_html_done=(100 * ((int) (adr - r.adr)) ) / (int)(r.size);
  2319.           if (_hts_in_html_poll) {
  2320.             _hts_in_html_poll=0;
  2321.             // temps à attendre, et remplir autant que l'on peut le cache (backing)
  2322.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  2323.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  2324.  
  2325.             // Transfer rate
  2326.             engine_stats();
  2327.             
  2328.             // Refresh various stats
  2329.             HTS_STAT.stat_nsocket=back_nsoc(back,back_max);
  2330.             HTS_STAT.stat_errors=fspc(NULL,"error");
  2331.             HTS_STAT.stat_warnings=fspc(NULL,"warning");
  2332.             HTS_STAT.stat_infos=fspc(NULL,"info");
  2333.             HTS_STAT.nbk=backlinks_done(liens,lien_tot,ptr);
  2334.             HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,back,back_max);
  2335.  
  2336.             if (!hts_htmlcheck_loop(back,back_max,0,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
  2337.               if (opt.errlog) {
  2338.                 fspc(opt.errlog,"info"); fprintf(opt.errlog,"Exit requested by shell or user"LF);
  2339.                 test_flush;
  2340.               } 
  2341.               exit_xh=1;  // exit requested
  2342.               XH_uninit;
  2343.               return 0;
  2344.               //adr = r.adr + r.size;  // exit
  2345.             } else if (_hts_cancel==1) {
  2346.               // adr = r.adr + r.size;  // exit
  2347.               nofollow=1;               // moins violent
  2348.               _hts_cancel=0;
  2349.             }
  2350.           }
  2351.  
  2352.           // refresh the backing system each 2 seconds
  2353.           if (engine_stats()) {
  2354.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  2355.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  2356.           }
  2357. #endif
  2358.         } while(( ((int) (adr - r.adr)) ) < r.size);
  2359. #if HTS_ANALYSTE
  2360.         _hts_in_html_parsing=0;  // flag
  2361.         _hts_cancel=0;           // pas de cancel
  2362. #endif
  2363.         if ((opt.getmode & 1) && (ptr>0)) {
  2364.           HT_ADD_END;    // achever
  2365.         }
  2366.         //
  2367.         //
  2368.         //
  2369.       }  // if !error
  2370.       
  2371.       
  2372.       if (opt.getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  2373.       // sauver fichier
  2374.       //structcheck(savename);
  2375.       //filesave(r.adr,r.size,savename);
  2376.       
  2377. #if HTS_ANALYSTE
  2378.     }  // analyse OK
  2379. #endif
  2380.         
  2381.